commit message
This commit is contained in:
BIN
compare_datasets_10k.png
Normal file
BIN
compare_datasets_10k.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 47 KiB |
BIN
compare_datasets_2k.png
Normal file
BIN
compare_datasets_2k.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
BIN
compare_models_10k.png
Normal file
BIN
compare_models_10k.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 43 KiB |
@@ -120,13 +120,14 @@ datasets = [
|
|||||||
]
|
]
|
||||||
estimators = {
|
estimators = {
|
||||||
#"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
|
#"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
|
||||||
#"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
|
"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
|
||||||
#"Perceptron": Perceptron(random_state=0, max_iter=max_iter),
|
#"Perceptron": Perceptron(random_state=0, max_iter=max_iter),
|
||||||
#"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
|
#"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
|
||||||
#"NearestCentroid": NearestCentroid(),
|
#"NearestCentroid": NearestCentroid(),
|
||||||
#"LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
|
"LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
|
||||||
|
#"AdaBoost": AdaBoostClassifier(),
|
||||||
#"GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
|
#"GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
|
||||||
"HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
|
#"HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
|
||||||
#"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
|
#"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
|
||||||
#"MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
|
#"MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
|
||||||
}
|
}
|
||||||
|
|||||||
BIN
compare_models_10k_3.png
Normal file
BIN
compare_models_10k_3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 40 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 66 KiB After Width: | Height: | Size: 67 KiB |
131
notebook.ipynb
131
notebook.ipynb
@@ -16,7 +16,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": null,
|
||||||
"id": "3116b75f",
|
"id": "3116b75f",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
@@ -90,7 +90,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": null,
|
||||||
"id": "d159117377f3633c",
|
"id": "d159117377f3633c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -113,7 +113,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": null,
|
||||||
"id": "986fbb31a7ae0d8b",
|
"id": "986fbb31a7ae0d8b",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
@@ -170,7 +170,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": null,
|
||||||
"id": "44239f6b7fd23cde",
|
"id": "44239f6b7fd23cde",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -197,7 +197,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": null,
|
||||||
"id": "ebc5a24e9bc87fdd",
|
"id": "ebc5a24e9bc87fdd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -233,7 +233,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": null,
|
||||||
"id": "d2c3527a5fc876bf",
|
"id": "d2c3527a5fc876bf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -285,7 +285,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 25,
|
"execution_count": null,
|
||||||
"id": "4e8b407c",
|
"id": "4e8b407c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -330,7 +330,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 26,
|
"execution_count": null,
|
||||||
"id": "86d9da42f4df8e49",
|
"id": "86d9da42f4df8e49",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -348,12 +348,12 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"#### Removing unpredictable Datapoints\n",
|
"#### Removing unpredictable Datapoints\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Some genres have too few datapoints to be predictable. The 10k Dataset has 12 Classes that have fewer than 5 Datapoints, usually only 1 or 2. These are too likely to fall into only the train or only the test data and will therefore be removed. "
|
"Some genres have too few datapoints to be predictable. The 10k Dataset has 14 Classes that have fewer than 10 Datapoints, usually only 1 to 4. These are too likely to fall into only the train or only the test data and will therefore be removed."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 27,
|
"execution_count": null,
|
||||||
"id": "e1bc73d4",
|
"id": "e1bc73d4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -368,7 +368,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# remove genres that have fewer than min_entries entries -> probability of a broken split is too big\n",
|
"# remove genres that have fewer than min_entries entries -> probability of a broken split is too big\n",
|
||||||
"mask = (y == 1).sum() >= 5\n",
|
"mask = (y == 1).sum() >= 10\n",
|
||||||
"print(\"Before\" + str(y.shape))\n",
|
"print(\"Before\" + str(y.shape))\n",
|
||||||
"y_prep = y.loc[:, mask]\n",
|
"y_prep = y.loc[:, mask]\n",
|
||||||
"print(\"After\" + str(y_prep.shape))"
|
"print(\"After\" + str(y_prep.shape))"
|
||||||
@@ -385,7 +385,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 28,
|
"execution_count": null,
|
||||||
"id": "4919bf1b37d171a7",
|
"id": "4919bf1b37d171a7",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -417,7 +417,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 29,
|
"execution_count": null,
|
||||||
"id": "cfbf3787",
|
"id": "cfbf3787",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
@@ -441,17 +441,17 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 30,
|
"execution_count": null,
|
||||||
"id": "0b0a46a4",
|
"id": "0b0a46a4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"1905"
|
"99"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 30,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@@ -502,7 +502,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Excursion: Choosing a classification Model\n",
|
"# Excursion: Choosing a classification Model\n",
|
||||||
"``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
|
"``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
|
||||||
"As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n",
|
"As such, we tested many different models on the small dataset and chose the best performing ones for the big dataset.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"### Initial Comparison\n",
|
"### Initial Comparison\n",
|
||||||
"We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n",
|
"We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n",
|
||||||
@@ -515,36 +515,34 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the 2k Dataset does not contain enough datapoints for every class (test data for 2 classes is 0), so we should proceed to the 10k Dataset before making major choices.\n",
|
"If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the Dataset does not contain enough datapoints for every class (test data for 2 classes is 0 in the 2k dataset), so we should proceed to the 10k Dataset.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The 10 best performing models which will run on the 10k Dataset with the same rules as before:\n",
|
"The 10 best performing models which will run on the 10k Dataset with the same rules as before:\n",
|
||||||
"1. NearestCentroid\n",
|
"1. PassiveAggressiveClassifier \n",
|
||||||
"2. Perceptron\n",
|
"2. Perceptron\n",
|
||||||
"3. PassiveAggressiveClassifier\n",
|
"3. LinearSVC\n",
|
||||||
"4. LinearSVC\n",
|
"4. SGDClassifier\n",
|
||||||
"5. SGDClassifier\n",
|
"5. HistGradientBoostingClassifier\n",
|
||||||
"6. HistGradientBoostingClassifier\n",
|
"6. NearestCentroid\n",
|
||||||
"7. MLPClassifier\n",
|
"7. MLPClassifier\n",
|
||||||
"8. RidgeClassifier\n",
|
"8. GradientBoostingClassifier \n",
|
||||||
"9. GradientBoostingClassifier\n",
|
"9. RidgeClassifier\n",
|
||||||
"10. LinearDiscriminantAnalysis\n",
|
"10. AdaBoostClassifier (because of an evaluation mistake, we used LinearDiscriminantAnalysis instead)\n",
|
||||||
|
"\n",
|
||||||
|
"That gave us the following results:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"We can also compare these models between datasets, to see if a bigger dataset always improves the performance.\n",
|
"The top 5 are the same, with the only exception being Perceptron, which fell behind the RidgeClassifier.\n",
|
||||||
|
"When comparing these models between datasets, it is evident that a bigger dataset yields better performance (for exponentially higher compute and time cost). Only NearestCentroid lost performance when comparing the Datasets.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The final contenders are:\n",
|
"The final contenders are LinearSVC and PassiveAggressiveClassifier. We would compare them against each other using k-fold cross validation with different hyperparameters, but since training the models on the dataset takes a lot of time and puts a big strain on our computers, we will stop here and use the LinearSVC classifier."
|
||||||
"1.\n",
|
|
||||||
"2.\n",
|
|
||||||
"3.\n",
|
|
||||||
"4.\n",
|
|
||||||
"5.\n",
|
|
||||||
"\n",
|
|
||||||
"..."
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -553,23 +551,27 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Model Selection\n",
|
"## Model Selection\n",
|
||||||
"**TODO Deciding which model to use for this task**\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``"
|
"As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LinearSVC``"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": null,
|
||||||
"id": "8c1d72c4532bd509",
|
"id": "8c1d72c4532bd509",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": []
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.linear_model import LogisticRegression\n",
|
"from sklearn.svm import LinearSVC\n",
|
||||||
"from sklearn.multioutput import MultiOutputClassifier\n",
|
"from sklearn.multioutput import MultiOutputClassifier\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)\n",
|
"multi_target_clf = MultiOutputClassifier(LinearSVC(max_iter=1337, random_state=0), n_jobs=1)\n",
|
||||||
"multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"multi_target_clf.fit(X_train, y_train)\n",
|
"multi_target_clf.fit(X_train, y_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -587,7 +589,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 32,
|
"execution_count": null,
|
||||||
"id": "e2ebea6945193e07",
|
"id": "e2ebea6945193e07",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -597,23 +599,23 @@
|
|||||||
"text": [
|
"text": [
|
||||||
" precision recall f1-score support\n",
|
" precision recall f1-score support\n",
|
||||||
"\n",
|
"\n",
|
||||||
" 0 0.78 0.91 0.84 300\n",
|
" 0 0.84 0.86 0.85 300\n",
|
||||||
" 1 0.78 0.62 0.69 216\n",
|
" 1 0.74 0.63 0.68 216\n",
|
||||||
" 2 1.00 0.03 0.07 86\n",
|
" 2 0.77 0.31 0.45 86\n",
|
||||||
" 3 0.00 0.00 0.00 46\n",
|
" 3 0.50 0.04 0.08 46\n",
|
||||||
" 4 1.00 0.04 0.07 83\n",
|
" 4 0.69 0.33 0.44 83\n",
|
||||||
" 5 0.79 0.81 0.80 245\n",
|
" 5 0.79 0.80 0.79 245\n",
|
||||||
" 6 0.00 0.00 0.00 42\n",
|
" 6 0.69 0.26 0.38 42\n",
|
||||||
" 7 0.90 0.34 0.49 127\n",
|
" 7 0.74 0.62 0.68 127\n",
|
||||||
" 8 0.00 0.00 0.00 12\n",
|
" 8 1.00 0.67 0.80 12\n",
|
||||||
" 9 0.89 0.25 0.39 127\n",
|
" 9 0.80 0.57 0.67 127\n",
|
||||||
" 10 0.00 0.00 0.00 14\n",
|
" 10 1.00 0.50 0.67 14\n",
|
||||||
" 11 0.88 0.14 0.24 106\n",
|
" 11 0.79 0.46 0.58 106\n",
|
||||||
"\n",
|
"\n",
|
||||||
" micro avg 0.79 0.50 0.61 1404\n",
|
" micro avg 0.79 0.62 0.69 1404\n",
|
||||||
" macro avg 0.58 0.26 0.30 1404\n",
|
" macro avg 0.78 0.51 0.59 1404\n",
|
||||||
"weighted avg 0.77 0.50 0.53 1404\n",
|
"weighted avg 0.77 0.62 0.67 1404\n",
|
||||||
" samples avg 0.77 0.56 0.60 1404\n",
|
" samples avg 0.80 0.68 0.70 1404\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -633,15 +635,6 @@
|
|||||||
"**TODO optimize the model based on the test results**"
|
"**TODO optimize the model based on the test results**"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "79b20645",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Validation\n",
|
|
||||||
"**TODO Predict actual values**"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "3b709fb7",
|
"id": "3b709fb7",
|
||||||
|
|||||||
Reference in New Issue
Block a user