Fix Standardization
This commit is contained in:
111
notebook.ipynb
111
notebook.ipynb
@@ -15,7 +15,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"id": "3116b75f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -104,7 +104,6 @@
|
||||
"- AppId\n",
|
||||
"- Name of the Game\n",
|
||||
"- Release Date\n",
|
||||
"- Reviews\n",
|
||||
"- Header Image\n",
|
||||
"- Website\n",
|
||||
"- Support URL\n",
|
||||
@@ -119,29 +118,84 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 17,
|
||||
"id": "06dedcdf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "\"['developer', 'publisher'] not found in axis\"",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mappid\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mname\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrelease_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mreviews\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mheader_image\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mwebsite\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msupport_url\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msupport_email\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mmetacritic_url\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdeveloper\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpublisher\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mscreenshots\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mmovies\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mestimated_owners\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset.head())\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\frame.py:5581\u001b[39m, in \u001b[36mDataFrame.drop\u001b[39m\u001b[34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[39m\n\u001b[32m 5433\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdrop\u001b[39m(\n\u001b[32m 5434\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 5435\u001b[39m labels: IndexLabel | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m (...)\u001b[39m\u001b[32m 5442\u001b[39m errors: IgnoreRaise = \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5443\u001b[39m ) -> DataFrame | \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 5444\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5445\u001b[39m \u001b[33;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[32m 5446\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 5579\u001b[39m \u001b[33;03m weight 1.0 0.8\u001b[39;00m\n\u001b[32m 5580\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m5581\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5582\u001b[39m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5583\u001b[39m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5584\u001b[39m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5585\u001b[39m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5586\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5587\u001b[39m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[43m=\u001b[49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5588\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5589\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\generic.py:4788\u001b[39m, in \u001b[36mNDFrame.drop\u001b[39m\u001b[34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[39m\n\u001b[32m 4786\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes.items():\n\u001b[32m 4787\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m4788\u001b[39m obj = \u001b[43mobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4790\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[32m 4791\u001b[39m \u001b[38;5;28mself\u001b[39m._update_inplace(obj)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\generic.py:4830\u001b[39m, in \u001b[36mNDFrame._drop_axis\u001b[39m\u001b[34m(self, labels, axis, level, errors, only_slice)\u001b[39m\n\u001b[32m 4828\u001b[39m new_axis = axis.drop(labels, level=level, errors=errors)\n\u001b[32m 4829\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m4830\u001b[39m new_axis = \u001b[43maxis\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4831\u001b[39m indexer = axis.get_indexer(new_axis)\n\u001b[32m 4833\u001b[39m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[32m 4834\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:7070\u001b[39m, in \u001b[36mIndex.drop\u001b[39m\u001b[34m(self, labels, errors)\u001b[39m\n\u001b[32m 7068\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m mask.any():\n\u001b[32m 7069\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m errors != \u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m7070\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlabels[mask].tolist()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not found in axis\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 7071\u001b[39m indexer = indexer[~mask]\n\u001b[32m 7072\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.delete(indexer)\n",
|
||||
"\u001b[31mKeyError\u001b[39m: \"['developer', 'publisher'] not found in axis\""
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" required_age price dlc_count \\\n",
|
||||
"0 0 0.00 1 \n",
|
||||
"1 0 0.00 0 \n",
|
||||
"2 0 0.00 2 \n",
|
||||
"3 17 0.00 0 \n",
|
||||
"4 17 3.99 9 \n",
|
||||
"\n",
|
||||
" detailed_description \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
|
||||
"2 The most-played game on Steam. Every day, mill... \n",
|
||||
"3 When a young street hustler, a retired bank ro... \n",
|
||||
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
|
||||
"\n",
|
||||
" about_the_game \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
|
||||
"2 The most-played game on Steam. Every day, mill... \n",
|
||||
"3 When a young street hustler, a retired bank ro... \n",
|
||||
"4 “One of the best first-person shooters ever ma... \n",
|
||||
"\n",
|
||||
" short_description \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"1 Play PUBG: BATTLEGROUNDS for free. Land on str... \n",
|
||||
"2 Every day, millions of players worldwide enter... \n",
|
||||
"3 Grand Theft Auto V for PC offers players the o... \n",
|
||||
"4 Tom Clancy's Rainbow Six® Siege is an elite, t... \n",
|
||||
"\n",
|
||||
" reviews windows mac linux \\\n",
|
||||
"0 NaN True False True \n",
|
||||
"1 NaN True False False \n",
|
||||
"2 “A modern multiplayer masterpiece.” 9.5/10 – D... True True True \n",
|
||||
"3 NaN True False False \n",
|
||||
"4 NaN True False False \n",
|
||||
"\n",
|
||||
" ... average_playtime_2weeks median_playtime_forever \\\n",
|
||||
"0 ... 879 5174 \n",
|
||||
"1 ... 0 0 \n",
|
||||
"2 ... 1536 898 \n",
|
||||
"3 ... 771 7101 \n",
|
||||
"4 ... 682 2434 \n",
|
||||
"\n",
|
||||
" median_playtime_2weeks discount peak_ccu \\\n",
|
||||
"0 350 0 1212356 \n",
|
||||
"1 0 0 616738 \n",
|
||||
"2 892 0 555977 \n",
|
||||
"3 74 0 117698 \n",
|
||||
"4 306 80 89916 \n",
|
||||
"\n",
|
||||
" tags pct_pos_total \\\n",
|
||||
"0 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... 86 \n",
|
||||
"1 {'Survival': 14838, 'Shooter': 12727, 'Battle ... 59 \n",
|
||||
"2 {'Free to Play': 59933, 'MOBA': 20158, 'Multip... 81 \n",
|
||||
"3 {'Open World': 32644, 'Action': 23539, 'Multip... 87 \n",
|
||||
"4 {'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '... 84 \n",
|
||||
"\n",
|
||||
" num_reviews_total pct_pos_recent num_reviews_recent \n",
|
||||
"0 8632939 82 96473 \n",
|
||||
"1 2513842 68 16720 \n",
|
||||
"2 2452595 80 29366 \n",
|
||||
"3 1803832 92 17517 \n",
|
||||
"4 1168020 76 12608 \n",
|
||||
"\n",
|
||||
"[5 rows x 34 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
|
||||
"dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developer', 'publisher', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
|
||||
"dataset.drop(['appid', 'name', 'release_date', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
|
||||
"print(dataset.head())"
|
||||
]
|
||||
},
|
||||
@@ -155,8 +209,8 @@
|
||||
"The dataset holds a lot of unstructured data, so we use Term Frequency-Inverse Document Frequency (TF-IDF) to structure most text features.\n",
|
||||
"It is important to use a new instance for each feature so they don't overlap with each other. \n",
|
||||
"\n",
|
||||
"### Standardize Numbers\n",
|
||||
"We standardize the prices so they can "
|
||||
"### Standardize Values\n",
|
||||
"We standardize only the text features to remove the stop words. The dataset already provides standardized numerical features."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -169,24 +223,21 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[1. 1. 1.]]\n"
|
||||
"[[1. 1. 1. 1.]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.compose import make_column_transformer\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"# types,desc_snippet,recent_reviews,all_reviews,release_date,popular_tags,game_details,languages,achievements,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price\n",
|
||||
"# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
|
||||
"column_transformer = make_column_transformer(\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['desc_snippet']),\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['mature_content']),\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['game_description']),\n",
|
||||
" (StandardScaler(), ['original_price','discount_price']) # use the same scaling for both\n",
|
||||
" ('passthrough', ['price']),\n",
|
||||
" #TODO: add transformer for every feature @flo @max\n",
|
||||
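" #TODO: check why not working -- likely because each column is selected with a list (['col']), which passes a 2-D frame; TfidfVectorizer needs 1-D text, so select the column with a plain string ('col')\n",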
|
||||
" (TfidfVectorizer(stop_words='english'), ['detailed_description']),\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['about_the_game']),\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['short_description']),\n",
|
||||
" (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
|
||||
")\n",
|
||||
"#\n",
|
||||
"\n",
|
||||
"dataset2 = column_transformer.fit_transform(dataset)\n",
|
||||
"print(dataset2)"
|
||||
]
|
||||
@@ -205,7 +256,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@@ -219,7 +270,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Reference in New Issue
Block a user