Merge remote-tracking branch 'origin/main'

This commit is contained in:
Maximilian Kany
2025-08-20 20:11:38 +02:00
3 changed files with 290 additions and 21 deletions

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.74 0.74 0.74 1109
1 0.67 0.63 0.65 1107
2 0.58 0.41 0.48 686
3 0.09 0.53 0.15 192
4 0.27 0.30 0.29 369
5 0.00 0.00 0.00 2
6 0.77 0.84 0.81 1576
7 0.06 0.44 0.10 135
8 0.58 0.45 0.50 707
9 0.92 0.63 0.75 91
10 0.74 0.54 0.63 682
11 0.12 0.44 0.19 112
12 0.70 0.52 0.60 562
13 0.00 0.00 0.00 5
micro avg 0.51 0.61 0.55 7335
macro avg 0.45 0.46 0.42 7335
weighted avg 0.64 0.61 0.61 7335
samples avg 0.54 0.65 0.55 7335

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.82 0.73 0.77 1109
1 0.73 0.69 0.71 1107
2 0.71 0.43 0.53 686
3 0.73 0.06 0.11 192
4 0.73 0.22 0.34 369
5 0.00 0.00 0.00 2
6 0.78 0.93 0.85 1576
7 0.85 0.21 0.34 135
8 0.79 0.55 0.65 707
9 0.98 0.57 0.72 91
10 0.88 0.47 0.61 682
11 0.93 0.46 0.61 112
12 0.81 0.57 0.67 562
13 0.00 0.00 0.00 5
micro avg 0.78 0.62 0.69 7335
macro avg 0.69 0.42 0.49 7335
weighted avg 0.78 0.62 0.67 7335
samples avg 0.79 0.68 0.69 7335

View File

@@ -16,14 +16,43 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 19,
"id": "3116b75f", "id": "3116b75f",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
"is_executing": true "is_executing": true
} }
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" appid name release_date required_age price dlc_count \\\n",
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
"\n",
" detailed_description \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" about_the_game \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" short_description reviews ... \\\n",
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
"\n",
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
"0 879 5174 350 \n",
"\n",
" discount peak_ccu tags \\\n",
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
"\n",
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
"0 86 8632939 82 96473 \n",
"\n",
"[1 rows x 47 columns]\n"
]
}
],
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
@@ -61,7 +90,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 20,
"id": "d159117377f3633c", "id": "d159117377f3633c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -84,14 +113,34 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 21,
"id": "986fbb31a7ae0d8b", "id": "986fbb31a7ae0d8b",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
"is_executing": true "is_executing": true
} }
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" desc \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
"\n",
" genres \n",
"0 ['Action', 'Free To Play'] \n",
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
"2 ['Action', 'Strategy', 'Free To Play'] \n",
"3 ['Action', 'Adventure'] \n",
"4 ['Action'] \n"
]
}
],
"source": [ "source": [
"from sklearn.compose import ColumnTransformer\n", "from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n",
@@ -121,7 +170,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 22,
"id": "44239f6b7fd23cde", "id": "44239f6b7fd23cde",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -148,10 +197,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 23,
"id": "ebc5a24e9bc87fdd", "id": "ebc5a24e9bc87fdd",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 [Action, Free To Play]\n",
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
"2 [Action, Strategy, Free To Play]\n",
"3 [Action, Adventure]\n",
"4 [Action]\n",
"Name: genres, dtype: object\n"
]
}
],
"source": [ "source": [
"import ast\n", "import ast\n",
"\n", "\n",
@@ -171,10 +233,30 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 24,
"id": "d2c3527a5fc876bf", "id": "d2c3527a5fc876bf",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
"0 1 0 0 0 1 0 0 \n",
"1 1 1 0 0 1 0 0 \n",
"2 1 0 0 0 1 0 0 \n",
"3 1 1 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
"0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 0 \n",
"3 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 \n"
]
}
],
"source": [ "source": [
"from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n", "\n",
@@ -203,10 +285,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 25,
"id": "4e8b407c", "id": "4e8b407c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 29351 columns]\n"
]
}
],
"source": [ "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n", "\n",
@@ -226,7 +330,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 26,
"id": "86d9da42f4df8e49", "id": "86d9da42f4df8e49",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -243,22 +347,62 @@
"## The Model\n", "## The Model\n",
"\n", "\n",
"#### Removing unpredictable Datapoints\n", "#### Removing unpredictable Datapoints\n",
"Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n", "\n",
"Some genres have too few datapoints to be predictable. The 10k Dataset has 12 Classes that have fewer than 5 Datapoints, usually only 1 or 2. The probability that such a class falls entirely into either the train or the test data is too high, so these classes are removed. "
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "e1bc73d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before(1999, 14)\n",
"After(1999, 12)\n"
]
}
],
"source": [
"# remove genres that have fewer than 5 entries -> probability of a broken split is too big\n",
"mask = (y == 1).sum() >= 5\n",
"print(\"Before\" + str(y.shape))\n",
"y_prep = y.loc[:, mask]\n",
"print(\"After\" + str(y_prep.shape))"
]
},
{
"cell_type": "markdown",
"id": "2fa60e6b",
"metadata": {},
"source": [
"Some Datapoints don't have a genre assigned (all feature values in y are 0, either from the start or because their only genres were removed in the previous step). The model we use can't handle such cases, thus they have to be removed.\n",
"We build a mask that selects all datapoints we can use and apply that mask to our matrices." "We build a mask that selects all datapoints we can use and apply that mask to our matrices."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 28,
"id": "4919bf1b37d171a7", "id": "4919bf1b37d171a7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [ "source": [
"mask = y.sum(axis=1).map(lambda x: x > 0)\n", "mask = y.sum(axis=1).map(lambda x: x > 0)\n",
"print((mask == False).sum()) # count of unpredictable datapoints\n", "print((mask == False).sum()) # count of unpredictable datapoints\n",
"\n", "\n",
"X_clean = X[mask]\n", "X_clean = X[mask]\n",
"y_clean = y[mask]" "y_clean = y_prep[mask]"
] ]
}, },
{ {
@@ -273,7 +417,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 29,
"id": "cfbf3787", "id": "cfbf3787",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
@@ -287,6 +431,62 @@
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)" "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
] ]
}, },
{
"cell_type": "markdown",
"id": "8cd4bb54",
"metadata": {},
"source": [
"We also do a little cleanup session before proceeding."
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "0b0a46a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1905"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import gc\n",
"\n",
"# Initial dataset loading\n",
"del dataset\n",
"del column_transformer\n",
"\n",
"# preparation of y\n",
"del mlb_genres\n",
"del genres_encoded\n",
"del genres_df\n",
"\n",
"# preparation of X\n",
"del tfidf_df\n",
"del vectorizer\n",
"del tfidf_matrix\n",
"\n",
"# Initial Dataset\n",
"del X\n",
"del y\n",
"# Removing Genres with less than 5 datapoints\n",
"del y_prep\n",
"\n",
"# Sorting out dead datapoints (all target values are 0)\n",
"del X_clean\n",
"del y_clean\n",
"del mask\n",
"\n",
"gc.collect()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "84f56229", "id": "84f56229",
@@ -360,7 +560,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 31,
"id": "8c1d72c4532bd509", "id": "8c1d72c4532bd509",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -387,10 +587,37 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 32,
"id": "e2ebea6945193e07", "id": "e2ebea6945193e07",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.78 0.91 0.84 300\n",
" 1 0.78 0.62 0.69 216\n",
" 2 1.00 0.03 0.07 86\n",
" 3 0.00 0.00 0.00 46\n",
" 4 1.00 0.04 0.07 83\n",
" 5 0.79 0.81 0.80 245\n",
" 6 0.00 0.00 0.00 42\n",
" 7 0.90 0.34 0.49 127\n",
" 8 0.00 0.00 0.00 12\n",
" 9 0.89 0.25 0.39 127\n",
" 10 0.00 0.00 0.00 14\n",
" 11 0.88 0.14 0.24 106\n",
"\n",
" micro avg 0.79 0.50 0.61 1404\n",
" macro avg 0.58 0.26 0.30 1404\n",
"weighted avg 0.77 0.50 0.53 1404\n",
" samples avg 0.77 0.56 0.60 1404\n",
"\n"
]
}
],
"source": [ "source": [
"from sklearn.metrics import classification_report\n", "from sklearn.metrics import classification_report\n",
"\n", "\n",