Fix Standardization

This commit is contained in:
FlorianSpeicher
2025-08-11 22:09:25 +02:00
parent c9973ca964
commit 67229da6ba

View File

@@ -15,7 +15,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 16,
"id": "3116b75f", "id": "3116b75f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -104,7 +104,6 @@
"- AppId\n", "- AppId\n",
"- Name of the Game\n", "- Name of the Game\n",
"- Realease Date\n", "- Realease Date\n",
"- Reviews\n",
"- Header Image\n", "- Header Image\n",
"- Website\n", "- Website\n",
"- Support URL\n", "- Support URL\n",
@@ -119,29 +118,84 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 17,
"id": "06dedcdf", "id": "06dedcdf",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "KeyError", "name": "stdout",
"evalue": "\"['developer', 'publisher'] not found in axis\"", "output_type": "stream",
"output_type": "error", "text": [
"traceback": [ " required_age price dlc_count \\\n",
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", "0 0 0.00 1 \n",
"\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", "1 0 0.00 0 \n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mappid\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mname\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrelease_date\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mreviews\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mheader_image\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mwebsite\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msupport_url\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msupport_email\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mmetacritic_url\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdeveloper\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mpublisher\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mscreenshots\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mmovies\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mestimated_owners\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset.head())\n", "2 0 0.00 2 \n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\frame.py:5581\u001b[39m, in \u001b[36mDataFrame.drop\u001b[39m\u001b[34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[39m\n\u001b[32m 5433\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdrop\u001b[39m(\n\u001b[32m 5434\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 5435\u001b[39m labels: IndexLabel | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m (...)\u001b[39m\u001b[32m 5442\u001b[39m errors: IgnoreRaise = \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5443\u001b[39m ) -> DataFrame | \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 5444\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5445\u001b[39m \u001b[33;03m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[32m 5446\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 5579\u001b[39m \u001b[33;03m weight 1.0 0.8\u001b[39;00m\n\u001b[32m 5580\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m5581\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5582\u001b[39m \u001b[43m \u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5583\u001b[39m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5584\u001b[39m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5585\u001b[39m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5586\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5587\u001b[39m \u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[43m=\u001b[49m\u001b[43minplace\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5588\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5589\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "3 17 0.00 0 \n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\generic.py:4788\u001b[39m, in \u001b[36mNDFrame.drop\u001b[39m\u001b[34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[39m\n\u001b[32m 4786\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m axis, labels \u001b[38;5;129;01min\u001b[39;00m axes.items():\n\u001b[32m 4787\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m4788\u001b[39m obj = \u001b[43mobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_drop_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4790\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m inplace:\n\u001b[32m 4791\u001b[39m \u001b[38;5;28mself\u001b[39m._update_inplace(obj)\n", "4 17 3.99 9 \n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\generic.py:4830\u001b[39m, in \u001b[36mNDFrame._drop_axis\u001b[39m\u001b[34m(self, labels, axis, level, errors, only_slice)\u001b[39m\n\u001b[32m 4828\u001b[39m new_axis = axis.drop(labels, level=level, errors=errors)\n\u001b[32m 4829\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m4830\u001b[39m new_axis = \u001b[43maxis\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdrop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4831\u001b[39m indexer = axis.get_indexer(new_axis)\n\u001b[32m 4833\u001b[39m \u001b[38;5;66;03m# Case for non-unique axis\u001b[39;00m\n\u001b[32m 4834\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", "\n",
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:7070\u001b[39m, in \u001b[36mIndex.drop\u001b[39m\u001b[34m(self, labels, errors)\u001b[39m\n\u001b[32m 7068\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m mask.any():\n\u001b[32m 7069\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m errors != \u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m7070\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlabels[mask].tolist()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not found in axis\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 7071\u001b[39m indexer = indexer[~mask]\n\u001b[32m 7072\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.delete(indexer)\n", " detailed_description \\\n",
"\u001b[31mKeyError\u001b[39m: \"['developer', 'publisher'] not found in axis\"" "0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
"\n",
" about_the_game \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 “One of the best first-person shooters ever ma... \n",
"\n",
" short_description \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 Play PUBG: BATTLEGROUNDS for free. Land on str... \n",
"2 Every day, millions of players worldwide enter... \n",
"3 Grand Theft Auto V for PC offers players the o... \n",
"4 Tom Clancy's Rainbow Six® Siege is an elite, t... \n",
"\n",
" reviews windows mac linux \\\n",
"0 NaN True False True \n",
"1 NaN True False False \n",
"2 “A modern multiplayer masterpiece.” 9.5/10 D... True True True \n",
"3 NaN True False False \n",
"4 NaN True False False \n",
"\n",
" ... average_playtime_2weeks median_playtime_forever \\\n",
"0 ... 879 5174 \n",
"1 ... 0 0 \n",
"2 ... 1536 898 \n",
"3 ... 771 7101 \n",
"4 ... 682 2434 \n",
"\n",
" median_playtime_2weeks discount peak_ccu \\\n",
"0 350 0 1212356 \n",
"1 0 0 616738 \n",
"2 892 0 555977 \n",
"3 74 0 117698 \n",
"4 306 80 89916 \n",
"\n",
" tags pct_pos_total \\\n",
"0 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... 86 \n",
"1 {'Survival': 14838, 'Shooter': 12727, 'Battle ... 59 \n",
"2 {'Free to Play': 59933, 'MOBA': 20158, 'Multip... 81 \n",
"3 {'Open World': 32644, 'Action': 23539, 'Multip... 87 \n",
"4 {'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '... 84 \n",
"\n",
" num_reviews_total pct_pos_recent num_reviews_recent \n",
"0 8632939 82 96473 \n",
"1 2513842 68 16720 \n",
"2 2452595 80 29366 \n",
"3 1803832 92 17517 \n",
"4 1168020 76 12608 \n",
"\n",
"[5 rows x 34 columns]\n"
] ]
} }
], ],
"source": [ "source": [
"# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n", "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
"dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developer', 'publisher', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n", "dataset.drop(['appid', 'name', 'release_date', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
"print(dataset.head())" "print(dataset.head())"
] ]
}, },
@@ -155,8 +209,8 @@
"The dataset holds a lot of unstructured data, we use Term Frequency-Inverse Document Frequency to structurize most Text-Features.\n", "The dataset holds a lot of unstructured data, we use Term Frequency-Inverse Document Frequency to structurize most Text-Features.\n",
"It is important to use an new Instance for each feature so they don't overlap with each other. \n", "It is important to use an new Instance for each feature so they don't overlap with each other. \n",
"\n", "\n",
"### Standardize Numbers\n", "### Standardize Values\n",
"We standardize the prices so they can " "We standardize only the text features to remove the stop words. The dataset allready provides standardized numerical features."
] ]
}, },
{ {
@@ -169,24 +223,21 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[[1. 1. 1.]]\n" "[[1. 1. 1. 1.]]\n"
] ]
} }
], ],
"source": [ "source": [
"from sklearn.compose import make_column_transformer\n", "from sklearn.compose import make_column_transformer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"# types,desc_snippet,recent_reviews,all_reviews,release_date,popular_tags,game_details,languages,achievements,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price\n", "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
"column_transformer = make_column_transformer(\n", "column_transformer = make_column_transformer(\n",
" (TfidfVectorizer(stop_words='english'), ['desc_snippet']),\n", " (TfidfVectorizer(stop_words='english'), ['detailed_description']),\n",
" (TfidfVectorizer(stop_words='english'), ['mature_content']),\n", " (TfidfVectorizer(stop_words='english'), ['about_the_game']),\n",
" (TfidfVectorizer(stop_words='english'), ['game_description']),\n", " (TfidfVectorizer(stop_words='english'), ['short_description']),\n",
" (StandardScaler(), ['original_price','discount_price']) # use the same scaling for both\n", " (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
" ('passthrough', ['price']),\n",
" #TODO: add transformer for every feature @flo @max\n",
" #TODO: check why not working\n",
")\n", ")\n",
"#\n", "\n",
"dataset2 = column_transformer.fit_transform(dataset)\n", "dataset2 = column_transformer.fit_transform(dataset)\n",
"print(dataset2)" "print(dataset2)"
] ]
@@ -205,7 +256,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "base",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@@ -219,7 +270,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.13.3" "version": "3.13.5"
} }
}, },
"nbformat": 4, "nbformat": 4,