mismades were taken

2025-08-18 14:14:34 +02:00
parent 3975cdf7e8
commit 28df88c0bf
38 changed files with 70 additions and 878 deletions
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -23,36 +23,7 @@
     "is_executing": true
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "   appid              name release_date  required_age  price  dlc_count  \\\n",
-      "0    730  Counter-Strike 2   2012-08-21             0    0.0          1   \n",
-      "\n",
-      "                                detailed_description  \\\n",
-      "0  For over two decades, Counter-Strike has offer...   \n",
-      "\n",
-      "                                      about_the_game  \\\n",
-      "0  For over two decades, Counter-Strike has offer...   \n",
-      "\n",
-      "                                   short_description reviews  ...  \\\n",
-      "0  For over two decades, Counter-Strike has offer...     NaN  ...   \n",
-      "\n",
-      "  average_playtime_2weeks median_playtime_forever median_playtime_2weeks  \\\n",
-      "0                     879                    5174                    350   \n",
-      "\n",
-      "  discount  peak_ccu                                               tags  \\\n",
-      "0        0   1212356  {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...   \n",
-      "\n",
-      "   pct_pos_total  num_reviews_total pct_pos_recent  num_reviews_recent  \n",
-      "0             86            8632939             82               96473  \n",
-      "\n",
-      "[1 rows x 47 columns]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
@@ -120,27 +91,7 @@
     "is_executing": true
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "                                                desc  \\\n",
-      "0  For over two decades, Counter-Strike has offer...   \n",
-      "1  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   \n",
-      "2  The most-played game on Steam. Every day, mill...   \n",
-      "3  When a young street hustler, a retired bank ro...   \n",
-      "4  Edition Comparison Ultimate Edition The Tom Cl...   \n",
-      "\n",
-      "                                              genres  \n",
-      "0                         ['Action', 'Free To Play']  \n",
-      "1  ['Action', 'Adventure', 'Massively Multiplayer...  \n",
-      "2             ['Action', 'Strategy', 'Free To Play']  \n",
-      "3                            ['Action', 'Adventure']  \n",
-      "4                                         ['Action']  \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
@@ -200,20 +151,7 @@
   "execution_count": null,
   "id": "ebc5a24e9bc87fdd",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0                               [Action, Free To Play]\n",
-      "1    [Action, Adventure, Massively Multiplayer, Fre...\n",
-      "2                     [Action, Strategy, Free To Play]\n",
-      "3                                  [Action, Adventure]\n",
-      "4                                             [Action]\n",
-      "Name: genres, dtype: object\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import ast\n",
    "\n",
@@ -236,27 +174,7 @@
   "execution_count": null,
   "id": "d2c3527a5fc876bf",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "   Action  Adventure  Casual  Early Access  Free To Play  Gore  Indie  \\\n",
-      "0       1          0       0             0             1     0      0   \n",
-      "1       1          1       0             0             1     0      0   \n",
-      "2       1          0       0             0             1     0      0   \n",
-      "3       1          1       0             0             0     0      0   \n",
-      "4       1          0       0             0             0     0      0   \n",
-      "\n",
-      "   Massively Multiplayer  RPG  Racing  Simulation  Sports  Strategy  Violent  \n",
-      "0                      0    0       0           0       0         0        0  \n",
-      "1                      1    0       0           0       0         0        0  \n",
-      "2                      0    0       0           0       0         1        0  \n",
-      "3                      0    0       0           0       0         0        0  \n",
-      "4                      0    0       0           0       0         0        0  \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
@@ -288,29 +206,7 @@
   "execution_count": null,
   "id": "4e8b407c",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "    00  000  000km    000th  00am  00f  00i  00p  00v   01  ...  이터널  이터널리턴  \\\n",
-      "0  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "1  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "2  0.0  0.0    0.0  0.14649   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "3  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "4  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "\n",
-      "   이현준  정대찬  중입니다   철권  토탈워  페르소나  한국어  한글을  \n",
-      "0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
-      "1  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
-      "2  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
-      "3  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
-      "4  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
-      "\n",
-      "[5 rows x 29351 columns]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
@@ -356,15 +252,7 @@
   "execution_count": null,
   "id": "4919bf1b37d171a7",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "13\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "mask = y.sum(axis=1).map(lambda x: x > 0)\n",
    "print((mask == False).sum()) # count of unpredictable datapoints\n",
@@ -399,12 +287,38 @@
    "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "84f56229",
+   "metadata": {},
+   "source": [
+    "Now that all data is prepared, we need to choose a classification model that meets our standards."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "917ba82f",
+   "metadata": {},
+   "source": [
+    "# Excursion: Choosing a classification Model\n",
+    "``sklearn`` has many different classification models to choose from, but we only have limited time and computing power.\n",
+    "As such, we tested many different models on the 2k dataset and chose the 5 best-performing ones for the big dataset.\n",
+    "\n",
+    "### The comparison\n",
+    "We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
+    "There were some rules as a baseline for comparison:\n",
+    "- All Hyperparameters are set to default\n",
+    "- All iteration limits are set to 3000\n",
+    "\n",
+    "![Comparison Image](./compare_models_2k.png)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "12b5283d",
   "metadata": {},
   "source": [
-    "# Model Selection\n",
+    "## Model Selection\n",
    "**TODO Deciding which model to use for this task**\n",
    "\n",
    "As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``"
@@ -442,36 +356,7 @@
   "execution_count": null,
   "id": "e2ebea6945193e07",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0       0.78      0.91      0.84       300\n",
-      "           1       0.78      0.62      0.69       216\n",
-      "           2       1.00      0.03      0.07        86\n",
-      "           3       0.00      0.00      0.00        46\n",
-      "           4       1.00      0.04      0.07        83\n",
-      "           5       0.00      0.00      0.00         0\n",
-      "           6       0.79      0.81      0.80       245\n",
-      "           7       0.00      0.00      0.00        42\n",
-      "           8       0.90      0.34      0.49       127\n",
-      "           9       0.00      0.00      0.00        12\n",
-      "          10       0.89      0.25      0.39       127\n",
-      "          11       0.00      0.00      0.00        14\n",
-      "          12       0.88      0.14      0.24       106\n",
-      "          13       0.00      0.00      0.00         0\n",
-      "\n",
-      "   micro avg       0.79      0.50      0.61      1404\n",
-      "   macro avg       0.50      0.22      0.26      1404\n",
-      "weighted avg       0.77      0.50      0.53      1404\n",
-      " samples avg       0.77      0.56      0.60      1404\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",