diff --git a/games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt b/games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt new file mode 100644 index 0000000..e27991d --- /dev/null +++ b/games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.74 0.74 0.74 1109 + 1 0.67 0.63 0.65 1107 + 2 0.58 0.41 0.48 686 + 3 0.09 0.53 0.15 192 + 4 0.27 0.30 0.29 369 + 5 0.00 0.00 0.00 2 + 6 0.77 0.84 0.81 1576 + 7 0.06 0.44 0.10 135 + 8 0.58 0.45 0.50 707 + 9 0.92 0.63 0.75 91 + 10 0.74 0.54 0.63 682 + 11 0.12 0.44 0.19 112 + 12 0.70 0.52 0.60 562 + 13 0.00 0.00 0.00 5 + + micro avg 0.51 0.61 0.55 7335 + macro avg 0.45 0.46 0.42 7335 +weighted avg 0.64 0.61 0.61 7335 + samples avg 0.54 0.65 0.55 7335 diff --git a/games_march2025_cleaned_10k/MLPClassifier.txt b/games_march2025_cleaned_10k/MLPClassifier.txt new file mode 100644 index 0000000..778e24f --- /dev/null +++ b/games_march2025_cleaned_10k/MLPClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.82 0.73 0.77 1109 + 1 0.73 0.69 0.71 1107 + 2 0.71 0.43 0.53 686 + 3 0.73 0.06 0.11 192 + 4 0.73 0.22 0.34 369 + 5 0.00 0.00 0.00 2 + 6 0.78 0.93 0.85 1576 + 7 0.85 0.21 0.34 135 + 8 0.79 0.55 0.65 707 + 9 0.98 0.57 0.72 91 + 10 0.88 0.47 0.61 682 + 11 0.93 0.46 0.61 112 + 12 0.81 0.57 0.67 562 + 13 0.00 0.00 0.00 5 + + micro avg 0.78 0.62 0.69 7335 + macro avg 0.69 0.42 0.49 7335 +weighted avg 0.78 0.62 0.67 7335 + samples avg 0.79 0.68 0.69 7335 diff --git a/notebook.ipynb b/notebook.ipynb index 00fc10c..aede9bb 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -16,14 +16,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "3116b75f", "metadata": { "jupyter": { "is_executing": true } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " appid name release_date required_age price dlc_count \\\n", + "0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n", + 
"\n", + " detailed_description \\\n", + "0 For over two decades, Counter-Strike has offer... \n", + "\n", + " about_the_game \\\n", + "0 For over two decades, Counter-Strike has offer... \n", + "\n", + " short_description reviews ... \\\n", + "0 For over two decades, Counter-Strike has offer... NaN ... \n", + "\n", + " average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n", + "0 879 5174 350 \n", + "\n", + " discount peak_ccu tags \\\n", + "0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n", + "\n", + " pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n", + "0 86 8632939 82 96473 \n", + "\n", + "[1 rows x 47 columns]\n" + ] + } + ], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -61,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "d159117377f3633c", "metadata": {}, "outputs": [], @@ -84,14 +113,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "986fbb31a7ae0d8b", "metadata": { "jupyter": { "is_executing": true } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " desc \\\n", + "0 For over two decades, Counter-Strike has offer... \n", + "1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n", + "2 The most-played game on Steam. Every day, mill... \n", + "3 When a young street hustler, a retired bank ro... \n", + "4 Edition Comparison Ultimate Edition The Tom Cl... \n", + "\n", + " genres \n", + "0 ['Action', 'Free To Play'] \n", + "1 ['Action', 'Adventure', 'Massively Multiplayer... 
\n", + "2 ['Action', 'Strategy', 'Free To Play'] \n", + "3 ['Action', 'Adventure'] \n", + "4 ['Action'] \n" + ] + } + ], "source": [ "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n", @@ -121,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "44239f6b7fd23cde", "metadata": {}, "outputs": [], @@ -148,10 +197,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "ebc5a24e9bc87fdd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 [Action, Free To Play]\n", + "1 [Action, Adventure, Massively Multiplayer, Fre...\n", + "2 [Action, Strategy, Free To Play]\n", + "3 [Action, Adventure]\n", + "4 [Action]\n", + "Name: genres, dtype: object\n" + ] + } + ], "source": [ "import ast\n", "\n", @@ -171,10 +233,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "d2c3527a5fc876bf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Action Adventure Casual Early Access Free To Play Gore Indie \\\n", + "0 1 0 0 0 1 0 0 \n", + "1 1 1 0 0 1 0 0 \n", + "2 1 0 0 0 1 0 0 \n", + "3 1 1 0 0 0 0 0 \n", + "4 1 0 0 0 0 0 0 \n", + "\n", + " Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n", + "0 0 0 0 0 0 0 0 \n", + "1 1 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 1 0 \n", + "3 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 \n" + ] + } + ], "source": [ "from sklearn.preprocessing import MultiLabelBinarizer\n", "\n", @@ -203,10 +285,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "4e8b407c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n", + "0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 
0.0 0.0 \n", + "1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", + "\n", + " 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 29351 columns]\n" + ] + } + ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", @@ -226,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "86d9da42f4df8e49", "metadata": {}, "outputs": [], @@ -243,22 +347,62 @@ "## The Model\n", "\n", "#### Removing unpredicatble Datapoints\n", - "Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n", + "\n", + "Some genres have too few datapoints to be predictable. The 10k Dataset has 12 Classes that have fewer than 5 Datapoints, usually only 1 or 2. These have too high a probability of falling into only the train or only the test data and will therefore be removed. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e1bc73d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before(1999, 14)\n", + "After(1999, 12)\n" + ] + } + ], + "source": [ + "# remove genres that have fewer than 5 entries -> probability of a broken train/test split is too big\n", + "mask = (y == 1).sum() >= 5\n", + "print(\"Before\" + str(y.shape))\n", + "y_prep = y.loc[:, mask]\n", + "print(\"After\" + str(y_prep.shape))" + ] + }, + { + "cell_type": "markdown", + "id": "2fa60e6b", + "metadata": {}, + "source": [ + "Some Datapoints don't have a genre assigned (all feature values in y are 0, either from the start or because all of their genres were removed in the previous step). The model we use can't handle such cases, thus they have to be removed.\n", "We filter after all values that we can use with a mask, and apply that mask to our matrices." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "4919bf1b37d171a7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13\n" + ] + } + ], "source": [ "mask = y.sum(axis=1).map(lambda x: x > 0)\n", "print((mask == False).sum()) # count of unpredictable datapoints\n", "\n", "X_clean = X[mask]\n", - "y_clean = y[mask]" + "y_clean = y_prep[mask]" ] }, { @@ -273,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "cfbf3787", "metadata": { "jupyter": { @@ -287,6 +431,62 @@ "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)" ] }, + { + "cell_type": "markdown", + "id": "8cd4bb54", + "metadata": {}, + "source": [ + "We also do a little cleanup session before proceeding." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "0b0a46a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1905" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import gc\n", + "\n", + "# Initial dataset loading\n", + "del dataset\n", + "del column_transformer\n", + "\n", + "# preparation of y\n", + "del mlb_genres\n", + "del genres_encoded\n", + "del genres_df\n", + "\n", + "# preparation of X\n", + "del tfidf_df\n", + "del vectorizer\n", + "del tfidf_matrix\n", + "\n", + "# Initial Dataset\n", + "del X\n", + "del y\n", + "# Removing Genres with less than 5 datapoints\n", + "del y_prep\n", + "\n", + "# Sorting out dead datapoints (all target values are 0)\n", + "del X_clean\n", + "del y_clean\n", + "del mask\n", + "\n", + "gc.collect()" + ] + }, { "cell_type": "markdown", "id": "84f56229", @@ -360,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "8c1d72c4532bd509", "metadata": {}, "outputs": [], @@ -387,10 +587,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "e2ebea6945193e07", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.78 0.91 0.84 300\n", + " 1 0.78 0.62 0.69 216\n", + " 2 1.00 0.03 0.07 86\n", + " 3 0.00 0.00 0.00 46\n", + " 4 1.00 0.04 0.07 83\n", + " 5 0.79 0.81 0.80 245\n", + " 6 0.00 0.00 0.00 42\n", + " 7 0.90 0.34 0.49 127\n", + " 8 0.00 0.00 0.00 12\n", + " 9 0.89 0.25 0.39 127\n", + " 10 0.00 0.00 0.00 14\n", + " 11 0.88 0.14 0.24 106\n", + "\n", + " micro avg 0.79 0.50 0.61 1404\n", + " macro avg 0.58 0.26 0.30 1404\n", + "weighted avg 0.77 0.50 0.53 1404\n", + " samples avg 0.77 0.56 0.60 1404\n", + "\n" + ] + } + ], "source": [ "from sklearn.metrics import classification_report\n", "\n",