From 4842b55e4f4fcd7447ca62b3e2582420f5aa252e Mon Sep 17 00:00:00 2001
From: FlorianSpeicher <flo200538@gmail.com>
Date: Mon, 11 Aug 2025 23:01:20 +0200
Subject: [PATCH] Add Setting missing values and Data Split

---
 notebook.ipynb | 67 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/notebook.ipynb b/notebook.ipynb
index e70e62c..c44ab63 100644
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 4,
    "id": "3116b75f",
    "metadata": {},
    "outputs": [
@@ -89,7 +89,7 @@
     "\n",
     "# load data\n",
     "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
-    "dataset = pd.read_csv(\"./games_march2025_cleaned.csv\",sep=\",\")\n",
+    "dataset = pd.read_csv(\"./games_march2025_cleaned_10k.csv\",sep=\",\")\n",
     "print(dataset.head())"
    ]
   },
@@ -118,7 +118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 5,
    "id": "06dedcdf",
    "metadata": {},
    "outputs": [
@@ -215,15 +215,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "4e8b407c",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[1. 1. 1. 1.]]\n"
+     "ename": "AttributeError",
+     "evalue": "'numpy.ndarray' object has no attribute 'head'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mAttributeError\u001b[39m                            Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m      4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m      5\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      6\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      7\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      8\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      9\u001b[39m )\n\u001b[32m     11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n",
+      "\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'"
      ]
     }
    ],
@@ -238,8 +242,8 @@
     "    (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
     ")\n",
     "\n",
-    "dataset2 = column_transformer.fit_transform(dataset)\n",
-    "print(dataset2)"
+    "tfidf_features = column_transformer.fit_transform(dataset)  # separate name: later cells still need `dataset` as a DataFrame\n",
+    "print(tfidf_features.shape)  # fit_transform returns an array, which has no .head()"
    ]
   },
   {
@@ -262,6 +266,21 @@
     "**TODO: Removing or Setting values that are not set or NaN in the DataSet**"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dea7dc00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setting missing numeric values to the mean\n",
+    "dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n",
+    "# Setting missing text values to 'Unknown'\n",
+    "dataset.fillna('Unknown', inplace=True)\n",
+    "# Dropping any rows that still contain missing values\n",
+    "dataset.dropna(inplace=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "091d7e13",
@@ -271,6 +290,34 @@
     "**TODO splitting the Data into Train, test and validation data**"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "cfbf3787",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trainingsdaten: (7999, 33), Testdaten: (2000, 33)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Assumption: 'genres' is the target/label\n",
+    "X = dataset.drop('genres', axis=1)\n",
+    "y = dataset['genres']\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.2, random_state=42\n",
+    ")\n",
+    "\n",
+    "print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "12b5283d",