Add missing-value handling and data split

2025-08-11 23:01:20 +02:00
parent 3c055a1ce0
commit 4842b55e4f
1 changed files with 57 additions and 10 deletions
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -15,7 +15,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 4,
   "id": "3116b75f",
   "metadata": {},
   "outputs": [
@@ -89,7 +89,7 @@
    "\n",
    "# load data\n",
    "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
-    "dataset = pd.read_csv(\"./games_march2025_cleaned.csv\",sep=\",\")\n",
+    "dataset = pd.read_csv(\"./games_march2025_cleaned_10k.csv\",sep=\",\")\n",
    "print(dataset.head())"
   ]
  },
@@ -118,7 +118,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 5,
   "id": "06dedcdf",
   "metadata": {},
   "outputs": [
@@ -215,15 +215,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "id": "4e8b407c",
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
+     "ename": "AttributeError",
-     "output_type": "stream",
+     "evalue": "'numpy.ndarray' object has no attribute 'head'",
-     "text": [
+     "output_type": "error",
-      "[[1. 1. 1. 1.]]\n"
+     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mAttributeError\u001b[39m                            Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m      4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m      5\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      6\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      7\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      8\u001b[39m     (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m      9\u001b[39m )\n\u001b[32m     11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n",
      "\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'"
     ]
    }
   ],
@@ -238,8 +242,8 @@
    "    (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
    ")\n",
    "\n",
-    "dataset2 = column_transformer.fit_transform(dataset)\n",
+    "# Keep the raw DataFrame in `dataset`; later cells call DataFrame methods on it.\n",
+    "text_features = column_transformer.fit_transform(dataset)\n",
-    "print(dataset2)"
+    "# The transformed result is an array-like without .head(), so report its shape.\n",
+    "print(text_features.shape)"
   ]
  },
  {
@@ -262,6 +266,21 @@
    "**TODO: Remove or fill values that are missing (unset or NaN) in the dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dea7dc00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill missing numeric values with the mean of their column, then fill\n",
    "# everything that is still missing (e.g. text columns) with 'Unknown'.\n",
    "# Reassigning instead of using inplace=True keeps the steps in one chain.\n",
    "dataset = (\n",
    "    dataset\n",
    "    .fillna(dataset.mean(numeric_only=True))\n",
    "    .fillna('Unknown')\n",
    "    # Safety net: drop any row that still contains a missing value.\n",
    "    .dropna()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "091d7e13",
@@ -271,6 +290,34 @@
    "**TODO: Split the data into train, test, and validation sets**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "cfbf3787",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trainingsdaten: (7999, 33), Testdaten: (2000, 33)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Assumption: 'genres' is the target/label column; every other column is a feature\n",
    "y = dataset['genres']\n",
    "X = dataset.drop(columns='genres')\n",
    "\n",
    "# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12b5283d",