mismades were taken

This commit is contained in:
Tim
2025-08-18 14:14:34 +02:00
parent 3975cdf7e8
commit 28df88c0bf
38 changed files with 70 additions and 878 deletions

View File

@@ -23,36 +23,7 @@
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" appid name release_date required_age price dlc_count \\\n",
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
"\n",
" detailed_description \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" about_the_game \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" short_description reviews ... \\\n",
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
"\n",
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
"0 879 5174 350 \n",
"\n",
" discount peak_ccu tags \\\n",
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
"\n",
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
"0 86 8632939 82 96473 \n",
"\n",
"[1 rows x 47 columns]\n"
]
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
@@ -120,27 +91,7 @@
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" desc \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
"\n",
" genres \n",
"0 ['Action', 'Free To Play'] \n",
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
"2 ['Action', 'Strategy', 'Free To Play'] \n",
"3 ['Action', 'Adventure'] \n",
"4 ['Action'] \n"
]
}
],
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import FunctionTransformer\n",
@@ -200,20 +151,7 @@
"execution_count": null,
"id": "ebc5a24e9bc87fdd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 [Action, Free To Play]\n",
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
"2 [Action, Strategy, Free To Play]\n",
"3 [Action, Adventure]\n",
"4 [Action]\n",
"Name: genres, dtype: object\n"
]
}
],
"outputs": [],
"source": [
"import ast\n",
"\n",
@@ -236,27 +174,7 @@
"execution_count": null,
"id": "d2c3527a5fc876bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
"0 1 0 0 0 1 0 0 \n",
"1 1 1 0 0 1 0 0 \n",
"2 1 0 0 0 1 0 0 \n",
"3 1 1 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
"0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 0 \n",
"3 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 \n"
]
}
],
"outputs": [],
"source": [
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n",
@@ -288,29 +206,7 @@
"execution_count": null,
"id": "4e8b407c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 29351 columns]\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
@@ -356,15 +252,7 @@
"execution_count": null,
"id": "4919bf1b37d171a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"outputs": [],
"source": [
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
"print((mask == False).sum()) # count of unpredictable datapoints\n",
@@ -399,12 +287,38 @@
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
]
},
{
"cell_type": "markdown",
"id": "84f56229",
"metadata": {},
"source": [
"Now that all data is prepared, we need to choose a classification model that meets our standards."
]
},
{
"cell_type": "markdown",
"id": "917ba82f",
"metadata": {},
"source": [
"# Excursion: Choosing a classification Model\n",
"``sklearn`` offers many different classification models to choose from, but we only have limited time and computing power.\n",
"Therefore, we tested many different models on the 2k dataset and chose the 5 best-performing ones for the big dataset.\n",
"\n",
"### The comparison\n",
"We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
"To keep the comparison fair, the following baseline rules applied to every model:\n",
"- All hyperparameters are left at their default values\n",
"- All iteration limits are set to 3000\n",
"\n",
"![Comparison Image](./compare_models_2k.png)"
]
},
{
"cell_type": "markdown",
"id": "12b5283d",
"metadata": {},
"source": [
"# Model Selection\n",
"## Model Selection\n",
"**TODO Deciding which model to use for this task**\n",
"\n",
"Because a game can have multiple genres, our model(s) must be capable of multi-label classification. sklearn's ``MultiOutputClassifier`` can do this. As the underlying estimator for ``MultiOutputClassifier`` we use ``LogisticRegression``."
@@ -442,36 +356,7 @@
"execution_count": null,
"id": "e2ebea6945193e07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.78 0.91 0.84 300\n",
" 1 0.78 0.62 0.69 216\n",
" 2 1.00 0.03 0.07 86\n",
" 3 0.00 0.00 0.00 46\n",
" 4 1.00 0.04 0.07 83\n",
" 5 0.00 0.00 0.00 0\n",
" 6 0.79 0.81 0.80 245\n",
" 7 0.00 0.00 0.00 42\n",
" 8 0.90 0.34 0.49 127\n",
" 9 0.00 0.00 0.00 12\n",
" 10 0.89 0.25 0.39 127\n",
" 11 0.00 0.00 0.00 14\n",
" 12 0.88 0.14 0.24 106\n",
" 13 0.00 0.00 0.00 0\n",
"\n",
" micro avg 0.79 0.50 0.61 1404\n",
" macro avg 0.50 0.22 0.26 1404\n",
"weighted avg 0.77 0.50 0.53 1404\n",
" samples avg 0.77 0.56 0.60 1404\n",
"\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",