mismades were taken
This commit is contained in:
183
notebook.ipynb
183
notebook.ipynb
@@ -23,36 +23,7 @@
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" appid name release_date required_age price dlc_count \\\n",
|
||||
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
|
||||
"\n",
|
||||
" detailed_description \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"\n",
|
||||
" about_the_game \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"\n",
|
||||
" short_description reviews ... \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
|
||||
"\n",
|
||||
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
|
||||
"0 879 5174 350 \n",
|
||||
"\n",
|
||||
" discount peak_ccu tags \\\n",
|
||||
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
|
||||
"\n",
|
||||
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
|
||||
"0 86 8632939 82 96473 \n",
|
||||
"\n",
|
||||
"[1 rows x 47 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
@@ -120,27 +91,7 @@
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" desc \\\n",
|
||||
"0 For over two decades, Counter-Strike has offer... \n",
|
||||
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
|
||||
"2 The most-played game on Steam. Every day, mill... \n",
|
||||
"3 When a young street hustler, a retired bank ro... \n",
|
||||
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
|
||||
"\n",
|
||||
" genres \n",
|
||||
"0 ['Action', 'Free To Play'] \n",
|
||||
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
|
||||
"2 ['Action', 'Strategy', 'Free To Play'] \n",
|
||||
"3 ['Action', 'Adventure'] \n",
|
||||
"4 ['Action'] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.preprocessing import FunctionTransformer\n",
|
||||
@@ -200,20 +151,7 @@
|
||||
"execution_count": null,
|
||||
"id": "ebc5a24e9bc87fdd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 [Action, Free To Play]\n",
|
||||
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
|
||||
"2 [Action, Strategy, Free To Play]\n",
|
||||
"3 [Action, Adventure]\n",
|
||||
"4 [Action]\n",
|
||||
"Name: genres, dtype: object\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ast\n",
|
||||
"\n",
|
||||
@@ -236,27 +174,7 @@
|
||||
"execution_count": null,
|
||||
"id": "d2c3527a5fc876bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
|
||||
"0 1 0 0 0 1 0 0 \n",
|
||||
"1 1 1 0 0 1 0 0 \n",
|
||||
"2 1 0 0 0 1 0 0 \n",
|
||||
"3 1 1 0 0 0 0 0 \n",
|
||||
"4 1 0 0 0 0 0 0 \n",
|
||||
"\n",
|
||||
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
|
||||
"0 0 0 0 0 0 0 0 \n",
|
||||
"1 1 0 0 0 0 0 0 \n",
|
||||
"2 0 0 0 0 0 1 0 \n",
|
||||
"3 0 0 0 0 0 0 0 \n",
|
||||
"4 0 0 0 0 0 0 0 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
||||
"\n",
|
||||
@@ -288,29 +206,7 @@
|
||||
"execution_count": null,
|
||||
"id": "4e8b407c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
|
||||
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"\n",
|
||||
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
|
||||
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 29351 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
@@ -356,15 +252,7 @@
|
||||
"execution_count": null,
|
||||
"id": "4919bf1b37d171a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"13\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
|
||||
"print((mask == False).sum()) # count of unpredictable datapoints\n",
|
||||
@@ -399,12 +287,38 @@
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "84f56229",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now that all data is prepared, we need to choose a classification model that meets our standards."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "917ba82f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Excursion: Choosing a classification Model\n",
|
||||
"``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
|
||||
"As such, we tested many different models on the 2k dataset and chose the 5 best-performing ones for the big dataset.\n",
|
||||
"\n",
|
||||
"### The comparison\n",
|
||||
"We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
|
||||
"There were some rules as a baseline for comparison:\n",
|
||||
"- All Hyperparameters are set to default\n",
|
||||
"- All iteration limits are set to 3000\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12b5283d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Model Selection\n",
|
||||
"## Model Selection\n",
|
||||
"**TODO Deciding which model to use for this task**\n",
|
||||
"\n",
|
||||
"As a game can have multiple genres, our model(s) must be capable of multi-label classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier``, we use ``LogisticRegression``."
|
||||
@@ -442,36 +356,7 @@
|
||||
"execution_count": null,
|
||||
"id": "e2ebea6945193e07",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.78 0.91 0.84 300\n",
|
||||
" 1 0.78 0.62 0.69 216\n",
|
||||
" 2 1.00 0.03 0.07 86\n",
|
||||
" 3 0.00 0.00 0.00 46\n",
|
||||
" 4 1.00 0.04 0.07 83\n",
|
||||
" 5 0.00 0.00 0.00 0\n",
|
||||
" 6 0.79 0.81 0.80 245\n",
|
||||
" 7 0.00 0.00 0.00 42\n",
|
||||
" 8 0.90 0.34 0.49 127\n",
|
||||
" 9 0.00 0.00 0.00 12\n",
|
||||
" 10 0.89 0.25 0.39 127\n",
|
||||
" 11 0.00 0.00 0.00 14\n",
|
||||
" 12 0.88 0.14 0.24 106\n",
|
||||
" 13 0.00 0.00 0.00 0\n",
|
||||
"\n",
|
||||
" micro avg 0.79 0.50 0.61 1404\n",
|
||||
" macro avg 0.50 0.22 0.26 1404\n",
|
||||
"weighted avg 0.77 0.50 0.53 1404\n",
|
||||
" samples avg 0.77 0.56 0.60 1404\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"\n",
|
||||
|
||||
Reference in New Issue
Block a user