From 1885f863711aef03bbc9c35a29e965f7dacdcaa3 Mon Sep 17 00:00:00 2001
From: FlorianSpeicher <flo200538@gmail.com>
Date: Mon, 11 Aug 2025 23:45:16 +0200
Subject: [PATCH] Add test script and minor Notebook changes with Tim

---
 notebook.ipynb | 45 +++++++++++++++++----------
 test_script.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 test_script.py

diff --git a/notebook.ipynb b/notebook.ipynb
index c44ab63..44be09a 100644
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -10,12 +10,15 @@
     "The goal of this project is getting the genre(s) of a game trough its given metadata\n",
     "\n",
     "## Dataset\n",
-    "For our project we use a Steam DataSet from kaggle. You can find it under the following URL: [Kaggle.com](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)"
+    "For our project we use a Steam dataset from Kaggle. You can find it under the following URL: [Kaggle.com](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)\n",
+    "\n",
+    "### Importing the dataset\n",
+    "The dataset is loaded and stored in a variable."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "3116b75f",
    "metadata": {},
    "outputs": [
@@ -104,6 +107,7 @@
     "- AppId\n",
     "- Name of the Game\n",
     "- Realease Date\n",
+    "- Reviews\n",
     "- Header Image\n",
     "- Website\n",
     "- Support URL\n",
@@ -118,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "06dedcdf",
    "metadata": {},
    "outputs": [
@@ -195,7 +199,9 @@
    ],
    "source": [
     "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
-    "dataset.drop(['appid', 'name', 'release_date', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
+    "dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',\n",
+    "              'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],\n",
+    "              axis=1, inplace=True)\n",
     "print(dataset.head())"
    ]
   },
@@ -215,19 +221,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "id": "4e8b407c",
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'numpy.ndarray' object has no attribute 'head'",
+     "ename": "ValueError",
+     "evalue": "all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 3 has size 9999",
      "output_type": "error",
      "traceback": [
       "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mAttributeError\u001b[39m                            Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m      4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m      5\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      6\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      7\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      8\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      9\u001b[39m )\n\u001b[32m     11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n",
-      "\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'"
+      "\u001b[31mValueError\u001b[39m                                Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m      3\u001b[39m \u001b[38;5;66;03m# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\u001b[39;00m\n\u001b[32m      4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m      5\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      6\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      7\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      8\u001b[39m     (\u001b[33m'\u001b[39m\u001b[33mpassthrough\u001b[39m\u001b[33m'\u001b[39m, 
[\u001b[33m'\u001b[39m\u001b[33mrequired_age\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mprice\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mdlc_count\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mwindows\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmac\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mlinux\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmetacritic_score\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33machievements\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mrecommendations\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnotes\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33msupported_languages\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mfull_audio_languages\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mcategories\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mgenres\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33muser_score\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mscore_rank\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpositive\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnegative\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33maverage_playtime_forever\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33maverage_playtime_2weeks\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmedian_playtime_forever\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmedian_playtime_2weeks\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mdiscount\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpeak_ccu\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mtags\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpct_pos_total\u001b[39m\u001b[3
3m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnum_reviews_total\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpct_pos_recent\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnum_reviews_recent\u001b[39m\u001b[33m'\u001b[39m])\n\u001b[32m      9\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m dataset = \u001b[43mcolumn_transformer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     12\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset.head())\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:319\u001b[39m, in \u001b[36m_wrap_method_output.<locals>.wrapped\u001b[39m\u001b[34m(self, X, *args, **kwargs)\u001b[39m\n\u001b[32m    317\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[32m    318\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, *args, **kwargs):\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m     data_to_wrap = \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    320\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[32m    321\u001b[39m         \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[32m    322\u001b[39m         return_tuple = (\n\u001b[32m    323\u001b[39m             _wrap_data_with_container(method, data_to_wrap[\u001b[32m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[32m    324\u001b[39m             *data_to_wrap[\u001b[32m1\u001b[39m:],\n\u001b[32m    325\u001b[39m         )\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:1389\u001b[39m, in \u001b[36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m   1382\u001b[39m     estimator._validate_params()\n\u001b[32m   1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m   1385\u001b[39m     skip_parameter_validation=(\n\u001b[32m   1386\u001b[39m         prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m   1387\u001b[39m     )\n\u001b[32m   1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:1031\u001b[39m, in \u001b[36mColumnTransformer.fit_transform\u001b[39m\u001b[34m(self, X, y, **params)\u001b[39m\n\u001b[32m   1028\u001b[39m \u001b[38;5;28mself\u001b[39m._validate_output(Xs)\n\u001b[32m   1029\u001b[39m \u001b[38;5;28mself\u001b[39m._record_output_indices(Xs)\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_hstack\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mXs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m=\u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:1225\u001b[39m, in \u001b[36mColumnTransformer._hstack\u001b[39m\u001b[34m(self, Xs, n_samples)\u001b[39m\n\u001b[32m   1215\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m   1216\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33mConcatenating DataFrames from the transformer\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms output lead to\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m   1217\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33m an inconsistent number of samples. The output may have Pandas\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m   (...)\u001b[39m\u001b[32m   1220\u001b[39m             \u001b[33m\"\u001b[39m\u001b[33m samples.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m   1221\u001b[39m         )\n\u001b[32m   1223\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m output\n\u001b[32m-> \u001b[39m\u001b[32m1225\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhstack\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\numpy\\_core\\shape_base.py:364\u001b[39m, in \u001b[36mhstack\u001b[39m\u001b[34m(tup, dtype, casting)\u001b[39m\n\u001b[32m    362\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m _nx.concatenate(arrs, \u001b[32m0\u001b[39m, dtype=dtype, casting=casting)\n\u001b[32m    363\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m364\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_nx\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconcatenate\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcasting\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcasting\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[31mValueError\u001b[39m: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 3 has size 9999"
      ]
     }
    ],
@@ -239,7 +250,7 @@
     "    (TfidfVectorizer(stop_words='english'), ['detailed_description']),\n",
     "    (TfidfVectorizer(stop_words='english'), ['about_the_game']),\n",
     "    (TfidfVectorizer(stop_words='english'), ['short_description']),\n",
-    "    (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
+    "    ('passthrough', ['required_age','price','dlc_count','reviews','windows','mac','linux','metacritic_score','achievements','recommendations','notes','supported_languages','full_audio_languages','categories','genres','user_score','score_rank','positive','negative','average_playtime_forever','average_playtime_2weeks','median_playtime_forever','median_playtime_2weeks','discount','peak_ccu','tags','pct_pos_total','num_reviews_total','pct_pos_recent','num_reviews_recent'])\n",
     ")\n",
     "\n",
     "dataset = column_transformer.fit_transform(dataset)\n",
@@ -262,8 +273,8 @@
    "id": "6a2a3d4f",
    "metadata": {},
    "source": [
-    "### Setting missing values\n",
-    "**TODO: Removing or Setting values that are not set or NaN in the DataSet**"
+    "### Handling missing values\n",
+    "Missing numerical feature values are set to the mean of the respective feature, and missing text values are set to the default string `Unknown`. Rows that still contain NaN values afterwards are removed."
    ]
   },
   {
@@ -287,12 +298,12 @@
    "metadata": {},
    "source": [
     "# Data Split\n",
-    "**TODO splitting the Data into Train, test and validation data**"
+    "Splitting our dataset into training and testing data, with 80% of the samples used for training and 20% for testing."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "cfbf3787",
    "metadata": {},
    "outputs": [
@@ -307,7 +318,7 @@
    "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "# Annahme: 'genres' ist das Ziel/Label\n",
+    "# Setting the target feature 'genres' and dropping it from the dataset\n",
     "X = dataset.drop('genres', axis=1)\n",
     "y = dataset['genres']\n",
     "\n",
@@ -315,7 +326,7 @@
     "    X, y, test_size=0.2, random_state=42\n",
     ")\n",
     "\n",
-    "print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")"
+    "print(f\"Training: {X_train.shape}, Testing: {X_test.shape}\")"
    ]
   },
   {
diff --git a/test_script.py b/test_script.py
new file mode 100644
index 0000000..0acb908
--- /dev/null
+++ b/test_script.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pandas as pd
+
+
+#### INITIALIZE
+
+# Load the Steam games CSV and preview it; the column list noted for this file is:
+# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
+dataset = pd.read_csv(
+    "./games_march2025_cleaned_10k.csv",
+    sep=",",
+)
+print(dataset.head())
+
+#### DROP UNIQUES
+
+# Discard the columns listed below in place, then preview the reduced frame.
+dataset.drop(columns=['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url',
+                      'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots',
+                      'movies', 'estimated_owners'], inplace=True)
+print(dataset.head())
+
+
+
+
+
+
+#### SET MISSING VALUES
+# The order matters: numeric gaps are filled before the catch-all 'Unknown' fill.
+
+# Setting missing numeric values to the column mean
+dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
+# Setting all remaining missing values (expected: the text columns) to 'Unknown'
+dataset.fillna('Unknown', inplace=True)
+# Dropping rows that still contain NaN; after the scalar fill above none should remain
+dataset.dropna(inplace=True)
+
+
+
+
+
+#### STRUCTURIZE AND STANDARDIZE
+
+from sklearn.compose import make_column_transformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
+
+# 'genres' is the target/label: split it off first so it is not also used as a feature
+y = dataset['genres']
+raw_features = dataset.drop('genres', axis=1)
+
+# TfidfVectorizer needs a 1-D sequence of documents, so every text column is selected by a
+# scalar name; a one-element list would hand it a 2-D frame and yield a single "sample"
+column_transformer = make_column_transformer(
+    (TfidfVectorizer(stop_words='english'), 'detailed_description'),
+    (TfidfVectorizer(stop_words='english'), 'about_the_game'),
+    (TfidfVectorizer(stop_words='english'), 'short_description'),
+    (OneHotEncoder(), ['windows', 'mac', 'linux']),
+    (StandardScaler(), ['price']),
+    (FunctionTransformer(lambda x: x/100.0), ['metacritic_score']),
+    (StandardScaler(), ['achievements']),
+    (StandardScaler(), ['recommendations']),
+    #TODO: custom onehot encoder for supported_languages, full_audio_languages, categories, tags;
+    # left out until then: presumably raw text, which cannot be stacked with the sparse TF-IDF output
+    ('passthrough', ['required_age', 'dlc_count','user_score','score_rank','positive','negative','average_playtime_forever','average_playtime_2weeks','median_playtime_forever','median_playtime_2weeks','discount','peak_ccu','pct_pos_total','num_reviews_total','pct_pos_recent','num_reviews_recent'])
+)
+
+# fit_transform returns a SciPy sparse matrix or NumPy array, not a DataFrame, so report its
+# shape instead of calling .head()
+X = column_transformer.fit_transform(raw_features)
+print(X.shape)
+
+
+
+
+#### SPLIT TRAIN / TEST
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")
\ No newline at end of file