Merge remote-tracking branch 'origin/main'

2025-08-20 20:11:38 +02:00
parent d1049e233e 34d2a9825f
commit fd7bd21fc0
3 changed files with 290 additions and 21 deletions
--- a/games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt
+++ b/games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.74      0.74      0.74      1109
           1       0.67      0.63      0.65      1107
           2       0.58      0.41      0.48       686
           3       0.09      0.53      0.15       192
           4       0.27      0.30      0.29       369
           5       0.00      0.00      0.00         2
           6       0.77      0.84      0.81      1576
           7       0.06      0.44      0.10       135
           8       0.58      0.45      0.50       707
           9       0.92      0.63      0.75        91
          10       0.74      0.54      0.63       682
          11       0.12      0.44      0.19       112
          12       0.70      0.52      0.60       562
          13       0.00      0.00      0.00         5
   micro avg       0.51      0.61      0.55      7335
   macro avg       0.45      0.46      0.42      7335
 weighted avg       0.64      0.61      0.61      7335
 samples avg       0.54      0.65      0.55      7335
--- a/games_march2025_cleaned_10k/MLPClassifier.txt
+++ b/games_march2025_cleaned_10k/MLPClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.82      0.73      0.77      1109
           1       0.73      0.69      0.71      1107
           2       0.71      0.43      0.53       686
           3       0.73      0.06      0.11       192
           4       0.73      0.22      0.34       369
           5       0.00      0.00      0.00         2
           6       0.78      0.93      0.85      1576
           7       0.85      0.21      0.34       135
           8       0.79      0.55      0.65       707
           9       0.98      0.57      0.72        91
          10       0.88      0.47      0.61       682
          11       0.93      0.46      0.61       112
          12       0.81      0.57      0.67       562
          13       0.00      0.00      0.00         5
   micro avg       0.78      0.62      0.69      7335
   macro avg       0.69      0.42      0.49      7335
 weighted avg       0.78      0.62      0.67      7335
 samples avg       0.79      0.68      0.69      7335
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -16,14 +16,43 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "id": "3116b75f",
   "metadata": {
    "jupyter": {
     "is_executing": true
    }
   },
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   appid              name release_date  required_age  price  dlc_count  \\\n",
      "0    730  Counter-Strike 2   2012-08-21             0    0.0          1   \n",
      "\n",
      "                                detailed_description  \\\n",
      "0  For over two decades, Counter-Strike has offer...   \n",
      "\n",
      "                                      about_the_game  \\\n",
      "0  For over two decades, Counter-Strike has offer...   \n",
      "\n",
      "                                   short_description reviews  ...  \\\n",
      "0  For over two decades, Counter-Strike has offer...     NaN  ...   \n",
      "\n",
      "  average_playtime_2weeks median_playtime_forever median_playtime_2weeks  \\\n",
      "0                     879                    5174                    350   \n",
      "\n",
      "  discount  peak_ccu                                               tags  \\\n",
      "0        0   1212356  {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...   \n",
      "\n",
      "   pct_pos_total  num_reviews_total pct_pos_recent  num_reviews_recent  \n",
      "0             86            8632939             82               96473  \n",
      "\n",
      "[1 rows x 47 columns]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
@@ -61,7 +90,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
   "id": "d159117377f3633c",
   "metadata": {},
   "outputs": [],
@@ -84,14 +113,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
   "id": "986fbb31a7ae0d8b",
   "metadata": {
    "jupyter": {
     "is_executing": true
    }
   },
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                desc  \\\n",
      "0  For over two decades, Counter-Strike has offer...   \n",
      "1  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   \n",
      "2  The most-played game on Steam. Every day, mill...   \n",
      "3  When a young street hustler, a retired bank ro...   \n",
      "4  Edition Comparison Ultimate Edition The Tom Cl...   \n",
      "\n",
      "                                              genres  \n",
      "0                         ['Action', 'Free To Play']  \n",
      "1  ['Action', 'Adventure', 'Massively Multiplayer...  \n",
      "2             ['Action', 'Strategy', 'Free To Play']  \n",
      "3                            ['Action', 'Adventure']  \n",
      "4                                         ['Action']  \n"
     ]
    }
   ],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
@@ -121,7 +170,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
   "id": "44239f6b7fd23cde",
   "metadata": {},
   "outputs": [],
@@ -148,10 +197,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
   "id": "ebc5a24e9bc87fdd",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0                               [Action, Free To Play]\n",
      "1    [Action, Adventure, Massively Multiplayer, Fre...\n",
      "2                     [Action, Strategy, Free To Play]\n",
      "3                                  [Action, Adventure]\n",
      "4                                             [Action]\n",
      "Name: genres, dtype: object\n"
     ]
    }
   ],
   "source": [
    "import ast\n",
    "\n",
@@ -171,10 +233,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
   "id": "d2c3527a5fc876bf",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Action  Adventure  Casual  Early Access  Free To Play  Gore  Indie  \\\n",
      "0       1          0       0             0             1     0      0   \n",
      "1       1          1       0             0             1     0      0   \n",
      "2       1          0       0             0             1     0      0   \n",
      "3       1          1       0             0             0     0      0   \n",
      "4       1          0       0             0             0     0      0   \n",
      "\n",
      "   Massively Multiplayer  RPG  Racing  Simulation  Sports  Strategy  Violent  \n",
      "0                      0    0       0           0       0         0        0  \n",
      "1                      1    0       0           0       0         0        0  \n",
      "2                      0    0       0           0       0         1        0  \n",
      "3                      0    0       0           0       0         0        0  \n",
      "4                      0    0       0           0       0         0        0  \n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
@@ -203,10 +285,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
   "id": "4e8b407c",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    00  000  000km    000th  00am  00f  00i  00p  00v   01  ...  이터널  이터널리턴  \\\n",
      "0  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "1  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "2  0.0  0.0    0.0  0.14649   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "3  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "4  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "\n",
      "   이현준  정대찬  중입니다   철권  토탈워  페르소나  한국어  한글을  \n",
      "0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "1  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "2  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "3  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "4  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "\n",
      "[5 rows x 29351 columns]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
@@ -226,7 +330,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
   "id": "86d9da42f4df8e49",
   "metadata": {},
   "outputs": [],
@@ -243,22 +347,62 @@
    "## The Model\n",
    "\n",
    "####  Removing unpredictable Datapoints\n",
-    "Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n",
+    "\n",
    "Some genres have too few datapoints to be predictable. The 10k dataset has 12 classes with fewer than 5 datapoints, usually only 1 or 2. The probability that such a genre ends up in only the train or only the test data is too high, so these genres are removed.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e1bc73d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before(1999, 14)\n",
      "After(1999, 12)\n"
     ]
    }
   ],
   "source": [
    "# Drop genres with fewer than min_entries positive samples: with so few rows a genre is\n",
    "# too likely to land entirely in the train or entirely in the test split.\n",
    "min_entries = 5\n",
    "# Column-wise count of rows labelled 1, compared against the threshold -> one boolean per genre.\n",
    "mask = (y == 1).sum() >= min_entries\n",
    "print(\"Before\" + str(y.shape))\n",
    "y_prep = y.loc[:, mask]\n",
    "print(\"After\" + str(y_prep.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fa60e6b",
   "metadata": {},
   "source": [
    "Some datapoints don't have a genre assigned (all target values in y are 0, either from the start or because their only genres were removed in the previous step). The model we use can't handle such cases, so they have to be removed.\n",
    "We build a boolean mask that selects the rows we can use and apply that mask to both of our matrices."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
   "id": "4919bf1b37d171a7",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13\n"
     ]
    }
   ],
   "source": [
    "# Build the row mask from y_prep rather than y: a row whose only genres were dropped in the\n",
    "# previous step is all-zero in y_prep and has to be filtered out as well.\n",
    "mask = y_prep.sum(axis=1) > 0\n",
    "print((~mask).sum()) # count of unpredictable datapoints\n",
    "\n",
    "X_clean = X[mask]\n",
-    "y_clean = y[mask]"
+    "y_clean = y_prep[mask]"
   ]
  },
  {
@@ -273,7 +417,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
   "id": "cfbf3787",
   "metadata": {
    "jupyter": {
@@ -287,6 +431,62 @@
    "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cd4bb54",
   "metadata": {},
   "source": [
    "Before proceeding, we also clean up: the intermediate objects created so far are deleted and the garbage collector is run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "0b0a46a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1905"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gc\n",
    "\n",
    "# Objects from the initial dataset loading\n",
    "del dataset, column_transformer\n",
    "\n",
    "# Objects from the preparation of y\n",
    "del mlb_genres, genres_encoded, genres_df\n",
    "\n",
    "# Objects from the preparation of X\n",
    "del tfidf_df, vectorizer, tfidf_matrix\n",
    "\n",
    "# Initial X / y, and y after removing genres with less than 5 datapoints\n",
    "del X, y, y_prep\n",
    "\n",
    "# Results of sorting out dead datapoints (all target values are 0)\n",
    "del X_clean, y_clean, mask\n",
    "\n",
    "# Run a collection pass; its return value is shown as the cell output.\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84f56229",
@@ -360,7 +560,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
   "id": "8c1d72c4532bd509",
   "metadata": {},
   "outputs": [],
@@ -387,10 +587,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 32,
   "id": "e2ebea6945193e07",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.78      0.91      0.84       300\n",
      "           1       0.78      0.62      0.69       216\n",
      "           2       1.00      0.03      0.07        86\n",
      "           3       0.00      0.00      0.00        46\n",
      "           4       1.00      0.04      0.07        83\n",
      "           5       0.79      0.81      0.80       245\n",
      "           6       0.00      0.00      0.00        42\n",
      "           7       0.90      0.34      0.49       127\n",
      "           8       0.00      0.00      0.00        12\n",
      "           9       0.89      0.25      0.39       127\n",
      "          10       0.00      0.00      0.00        14\n",
      "          11       0.88      0.14      0.24       106\n",
      "\n",
      "   micro avg       0.79      0.50      0.61      1404\n",
      "   macro avg       0.58      0.26      0.30      1404\n",
      "weighted avg       0.77      0.50      0.53      1404\n",
      " samples avg       0.77      0.56      0.60      1404\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",