From 4842b55e4f4fcd7447ca62b3e2582420f5aa252e Mon Sep 17 00:00:00 2001 From: FlorianSpeicher Date: Mon, 11 Aug 2025 23:01:20 +0200 Subject: [PATCH] Add Setting missing values and Data Split --- notebook.ipynb | 67 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/notebook.ipynb b/notebook.ipynb index e70e62c..c44ab63 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "3116b75f", "metadata": {}, "outputs": [ @@ -89,7 +89,7 @@ "\n", "# load data\n", "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n", - "dataset = pd.read_csv(\"./games_march2025_cleaned.csv\",sep=\",\")\n", + "dataset = pd.read_csv(\"./games_march2025_cleaned_10k.csv\",sep=\",\")\n", "print(dataset.head())" ] }, @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "06dedcdf", "metadata": {}, "outputs": [ @@ -215,15 +215,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "4e8b407c", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[1. 1. 1. 
1.]]\n" + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'head'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m 4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m 5\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 6\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 7\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 8\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 9\u001b[39m )\n\u001b[32m 11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n", + "\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'" ] } ], @@ -238,8 +242,8 @@ " (TfidfVectorizer(stop_words='english'), ['reviews']),\n", ")\n", "\n", - "dataset2 = column_transformer.fit_transform(dataset)\n", - "print(dataset2)" + "dataset = column_transformer.fit_transform(dataset)\n", + "print(dataset.head())" ] }, { @@ -262,6 +266,21 @@ "**TODO: Removing or Setting values that are 
not set or NaN in the DataSet**" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dea7dc00", + "metadata": {}, + "outputs": [], + "source": [ + "# Setting missing numeric values to the mean\n", + "dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n", + "# Setting missing text values to 'Unknown'\n", + "dataset.fillna('Unknown', inplace=True)\n", + "# Dropping any rows that still contain missing values\n", + "dataset.dropna(inplace=True)" + ] + }, + { "cell_type": "markdown", "id": "091d7e13", @@ -271,6 +290,34 @@ "**TODO splitting the Data into Train, test and validation data**" ] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cfbf3787", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trainingsdaten: (7999, 33), Testdaten: (2000, 33)\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Assumption: 'genres' is the target/label\n", + "X = dataset.drop('genres', axis=1)\n", + "y = dataset['genres']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")" + ] + }, { "cell_type": "markdown", "id": "12b5283d",