Merge remote-tracking branch 'origin/main'
This commit is contained in:
21
games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt
Normal file
21
games_march2025_cleaned_10k/LinearDiscriminantAnalysis.txt
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
0 0.74 0.74 0.74 1109
|
||||||
|
1 0.67 0.63 0.65 1107
|
||||||
|
2 0.58 0.41 0.48 686
|
||||||
|
3 0.09 0.53 0.15 192
|
||||||
|
4 0.27 0.30 0.29 369
|
||||||
|
5 0.00 0.00 0.00 2
|
||||||
|
6 0.77 0.84 0.81 1576
|
||||||
|
7 0.06 0.44 0.10 135
|
||||||
|
8 0.58 0.45 0.50 707
|
||||||
|
9 0.92 0.63 0.75 91
|
||||||
|
10 0.74 0.54 0.63 682
|
||||||
|
11 0.12 0.44 0.19 112
|
||||||
|
12 0.70 0.52 0.60 562
|
||||||
|
13 0.00 0.00 0.00 5
|
||||||
|
|
||||||
|
micro avg 0.51 0.61 0.55 7335
|
||||||
|
macro avg 0.45 0.46 0.42 7335
|
||||||
|
weighted avg 0.64 0.61 0.61 7335
|
||||||
|
samples avg 0.54 0.65 0.55 7335
|
||||||
21
games_march2025_cleaned_10k/MLPClassifier.txt
Normal file
21
games_march2025_cleaned_10k/MLPClassifier.txt
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
0 0.82 0.73 0.77 1109
|
||||||
|
1 0.73 0.69 0.71 1107
|
||||||
|
2 0.71 0.43 0.53 686
|
||||||
|
3 0.73 0.06 0.11 192
|
||||||
|
4 0.73 0.22 0.34 369
|
||||||
|
5 0.00 0.00 0.00 2
|
||||||
|
6 0.78 0.93 0.85 1576
|
||||||
|
7 0.85 0.21 0.34 135
|
||||||
|
8 0.79 0.55 0.65 707
|
||||||
|
9 0.98 0.57 0.72 91
|
||||||
|
10 0.88 0.47 0.61 682
|
||||||
|
11 0.93 0.46 0.61 112
|
||||||
|
12 0.81 0.57 0.67 562
|
||||||
|
13 0.00 0.00 0.00 5
|
||||||
|
|
||||||
|
micro avg 0.78 0.62 0.69 7335
|
||||||
|
macro avg 0.69 0.42 0.49 7335
|
||||||
|
weighted avg 0.78 0.62 0.67 7335
|
||||||
|
samples avg 0.79 0.68 0.69 7335
|
||||||
269
notebook.ipynb
269
notebook.ipynb
@@ -16,14 +16,43 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 19,
|
||||||
"id": "3116b75f",
|
"id": "3116b75f",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"is_executing": true
|
"is_executing": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" appid name release_date required_age price dlc_count \\\n",
|
||||||
|
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
|
||||||
|
"\n",
|
||||||
|
" detailed_description \\\n",
|
||||||
|
"0 For over two decades, Counter-Strike has offer... \n",
|
||||||
|
"\n",
|
||||||
|
" about_the_game \\\n",
|
||||||
|
"0 For over two decades, Counter-Strike has offer... \n",
|
||||||
|
"\n",
|
||||||
|
" short_description reviews ... \\\n",
|
||||||
|
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
|
||||||
|
"\n",
|
||||||
|
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
|
||||||
|
"0 879 5174 350 \n",
|
||||||
|
"\n",
|
||||||
|
" discount peak_ccu tags \\\n",
|
||||||
|
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
|
||||||
|
"\n",
|
||||||
|
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
|
||||||
|
"0 86 8632939 82 96473 \n",
|
||||||
|
"\n",
|
||||||
|
"[1 rows x 47 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
@@ -61,7 +90,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 20,
|
||||||
"id": "d159117377f3633c",
|
"id": "d159117377f3633c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -84,14 +113,34 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 21,
|
||||||
"id": "986fbb31a7ae0d8b",
|
"id": "986fbb31a7ae0d8b",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"is_executing": true
|
"is_executing": true
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" desc \\\n",
|
||||||
|
"0 For over two decades, Counter-Strike has offer... \n",
|
||||||
|
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
|
||||||
|
"2 The most-played game on Steam. Every day, mill... \n",
|
||||||
|
"3 When a young street hustler, a retired bank ro... \n",
|
||||||
|
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
|
||||||
|
"\n",
|
||||||
|
" genres \n",
|
||||||
|
"0 ['Action', 'Free To Play'] \n",
|
||||||
|
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
|
||||||
|
"2 ['Action', 'Strategy', 'Free To Play'] \n",
|
||||||
|
"3 ['Action', 'Adventure'] \n",
|
||||||
|
"4 ['Action'] \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
"from sklearn.preprocessing import FunctionTransformer\n",
|
"from sklearn.preprocessing import FunctionTransformer\n",
|
||||||
@@ -121,7 +170,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 22,
|
||||||
"id": "44239f6b7fd23cde",
|
"id": "44239f6b7fd23cde",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -148,10 +197,23 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 23,
|
||||||
"id": "ebc5a24e9bc87fdd",
|
"id": "ebc5a24e9bc87fdd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0 [Action, Free To Play]\n",
|
||||||
|
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
|
||||||
|
"2 [Action, Strategy, Free To Play]\n",
|
||||||
|
"3 [Action, Adventure]\n",
|
||||||
|
"4 [Action]\n",
|
||||||
|
"Name: genres, dtype: object\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import ast\n",
|
"import ast\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -171,10 +233,30 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 24,
|
||||||
"id": "d2c3527a5fc876bf",
|
"id": "d2c3527a5fc876bf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
|
||||||
|
"0 1 0 0 0 1 0 0 \n",
|
||||||
|
"1 1 1 0 0 1 0 0 \n",
|
||||||
|
"2 1 0 0 0 1 0 0 \n",
|
||||||
|
"3 1 1 0 0 0 0 0 \n",
|
||||||
|
"4 1 0 0 0 0 0 0 \n",
|
||||||
|
"\n",
|
||||||
|
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
|
||||||
|
"0 0 0 0 0 0 0 0 \n",
|
||||||
|
"1 1 0 0 0 0 0 0 \n",
|
||||||
|
"2 0 0 0 0 0 1 0 \n",
|
||||||
|
"3 0 0 0 0 0 0 0 \n",
|
||||||
|
"4 0 0 0 0 0 0 0 \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -203,10 +285,32 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 25,
|
||||||
"id": "4e8b407c",
|
"id": "4e8b407c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
|
||||||
|
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||||
|
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||||
|
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||||
|
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||||
|
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||||
|
"\n",
|
||||||
|
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
|
||||||
|
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||||
|
"\n",
|
||||||
|
"[5 rows x 29351 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -226,7 +330,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 26,
|
||||||
"id": "86d9da42f4df8e49",
|
"id": "86d9da42f4df8e49",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -243,22 +347,62 @@
|
|||||||
"## The Model\n",
|
"## The Model\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#### Removing unpredictable Datapoints\n",
|
"#### Removing unpredictable Datapoints\n",
|
||||||
"Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n",
|
"\n",
|
||||||
|
"Some genres have too few datapoints to be predictable. The 10k Dataset has 12 Classes that have fewer than 5 Datapoints, usually only 1 or 2. The probability that such a class ends up in only the train or only the test data is too high, so these classes are removed. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"id": "e1bc73d4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Before(1999, 14)\n",
|
||||||
|
"After(1999, 12)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# remove genres that have fewer than 5 entries -> probability of a broken split too big\n",
|
||||||
|
"mask = (y == 1).sum() >= 5\n",
|
||||||
|
"print(\"Before\" + str(y.shape))\n",
|
||||||
|
"y_prep = y.loc[:, mask]\n",
|
||||||
|
"print(\"After\" + str(y_prep.shape))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2fa60e6b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Some Datapoints don't have a genre assigned (all target values in y are 0, either from the start or because their only genres were removed in the previous step). The model we use can't handle such cases, thus they have to be removed.\n",
|
||||||
"We build a mask that selects all datapoints we can use, and apply that mask to our matrices."
|
"We build a mask that selects all datapoints we can use, and apply that mask to our matrices."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 28,
|
||||||
"id": "4919bf1b37d171a7",
|
"id": "4919bf1b37d171a7",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"13\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
|
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
|
||||||
"print((mask == False).sum()) # count of unpredictable datapoints\n",
|
"print((mask == False).sum()) # count of unpredictable datapoints\n",
|
||||||
"\n",
|
"\n",
|
||||||
"X_clean = X[mask]\n",
|
"X_clean = X[mask]\n",
|
||||||
"y_clean = y[mask]"
|
"y_clean = y_prep[mask]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -273,7 +417,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 29,
|
||||||
"id": "cfbf3787",
|
"id": "cfbf3787",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
@@ -287,6 +431,62 @@
|
|||||||
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
|
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8cd4bb54",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We also do a little cleanup session before proceeding."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"id": "0b0a46a4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1905"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import gc\n",
|
||||||
|
"\n",
|
||||||
|
"# Initial dataset loading\n",
|
||||||
|
"del dataset\n",
|
||||||
|
"del column_transformer\n",
|
||||||
|
"\n",
|
||||||
|
"# preparation of y\n",
|
||||||
|
"del mlb_genres\n",
|
||||||
|
"del genres_encoded\n",
|
||||||
|
"del genres_df\n",
|
||||||
|
"\n",
|
||||||
|
"# preparation of X\n",
|
||||||
|
"del tfidf_df\n",
|
||||||
|
"del vectorizer\n",
|
||||||
|
"del tfidf_matrix\n",
|
||||||
|
"\n",
|
||||||
|
"# Initial Dataset\n",
|
||||||
|
"del X\n",
|
||||||
|
"del y\n",
|
||||||
|
"# Removing Genres with less than 5 datapoints\n",
|
||||||
|
"del y_prep\n",
|
||||||
|
"\n",
|
||||||
|
"# Sorting out dead datapoints (all target values are 0)\n",
|
||||||
|
"del X_clean\n",
|
||||||
|
"del y_clean\n",
|
||||||
|
"del mask\n",
|
||||||
|
"\n",
|
||||||
|
"gc.collect()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "84f56229",
|
"id": "84f56229",
|
||||||
@@ -360,7 +560,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 31,
|
||||||
"id": "8c1d72c4532bd509",
|
"id": "8c1d72c4532bd509",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -387,10 +587,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 32,
|
||||||
"id": "e2ebea6945193e07",
|
"id": "e2ebea6945193e07",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.78 0.91 0.84 300\n",
|
||||||
|
" 1 0.78 0.62 0.69 216\n",
|
||||||
|
" 2 1.00 0.03 0.07 86\n",
|
||||||
|
" 3 0.00 0.00 0.00 46\n",
|
||||||
|
" 4 1.00 0.04 0.07 83\n",
|
||||||
|
" 5 0.79 0.81 0.80 245\n",
|
||||||
|
" 6 0.00 0.00 0.00 42\n",
|
||||||
|
" 7 0.90 0.34 0.49 127\n",
|
||||||
|
" 8 0.00 0.00 0.00 12\n",
|
||||||
|
" 9 0.89 0.25 0.39 127\n",
|
||||||
|
" 10 0.00 0.00 0.00 14\n",
|
||||||
|
" 11 0.88 0.14 0.24 106\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.79 0.50 0.61 1404\n",
|
||||||
|
" macro avg 0.58 0.26 0.30 1404\n",
|
||||||
|
"weighted avg 0.77 0.50 0.53 1404\n",
|
||||||
|
" samples avg 0.77 0.56 0.60 1404\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.metrics import classification_report\n",
|
"from sklearn.metrics import classification_report\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
Reference in New Issue
Block a user