diff --git a/compare_datasets_10k.png b/compare_datasets_10k.png new file mode 100644 index 0000000..4b887e6 Binary files /dev/null and b/compare_datasets_10k.png differ diff --git a/compare_datasets_2k.png b/compare_datasets_2k.png new file mode 100644 index 0000000..a9b54eb Binary files /dev/null and b/compare_datasets_2k.png differ diff --git a/compare_models_10k.png b/compare_models_10k.png new file mode 100644 index 0000000..40fa4ef Binary files /dev/null and b/compare_models_10k.png differ diff --git a/compare_models_10k.py b/compare_models_10k.py index 1f28b02..d1bb368 100644 --- a/compare_models_10k.py +++ b/compare_models_10k.py @@ -120,13 +120,14 @@ datasets = [ ] estimators = { #"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), - #"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), + "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), #"Perceptron": Perceptron(random_state=0, max_iter=max_iter), #"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), #"NearestCentroid": NearestCentroid(), - #"LinearSVC": LinearSVC(random_state=0, max_iter=max_iter), + "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter), + #"AdaBoost": AdaBoostClassifier(), #"GradientBoostingClassifier": GradientBoostingClassifier(random_state=0), - "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter), + #"HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter), #"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), #"MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True), } diff --git a/compare_models_10k_3.png b/compare_models_10k_3.png new file mode 100644 index 0000000..37470e0 Binary files /dev/null and b/compare_models_10k_3.png differ diff --git a/compare_models_2k.png b/compare_models_2k.png index 5e46552..15b1fe8 100644 
Binary files a/compare_models_2k.png and b/compare_models_2k.png differ diff --git a/notebook.ipynb b/notebook.ipynb index aede9bb..8d439c5 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "3116b75f", "metadata": { "jupyter": { @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "d159117377f3633c", "metadata": {}, "outputs": [], @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "986fbb31a7ae0d8b", "metadata": { "jupyter": { @@ -170,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "44239f6b7fd23cde", "metadata": {}, "outputs": [], @@ -197,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "ebc5a24e9bc87fdd", "metadata": {}, "outputs": [ @@ -233,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "d2c3527a5fc876bf", "metadata": {}, "outputs": [ @@ -285,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "4e8b407c", "metadata": {}, "outputs": [ @@ -330,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "86d9da42f4df8e49", "metadata": {}, "outputs": [], @@ -348,12 +348,12 @@ "\n", "#### Removing unpredicatble Datapoints\n", "\n", - "Some genres have too little datapoints to be predictable. The 10k Dataset has 12 Classes that have less than 5 Datapoints, usually only 1 oder 2. These have too big of a probability that they will fall into only the train or test data and therefore will be removed. " + "Some genres have too little datapoints to be predictable. The 10k Dataset has 14 Classes that have less than 10 Datapoints, usually only 1 to 4. 
These have too big of a probability that they will fall into only the train or test data and therefore will be removed." ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "e1bc73d4", "metadata": {}, "outputs": [ @@ -368,7 +368,7 @@ ], "source": [ "# remove genres that have less than min_entries entries -> probability of broken split to big\n", - "mask = (y == 1).sum() >= 5\n", + "mask = (y == 1).sum() >= 10\n", "print(\"Before\" + str(y.shape))\n", "y_prep = y.loc[:, mask]\n", "print(\"After\" + str(y_prep.shape))" @@ -385,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "4919bf1b37d171a7", "metadata": {}, "outputs": [ @@ -417,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "cfbf3787", "metadata": { "jupyter": { @@ -441,17 +441,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "0b0a46a4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1905" + "99" ] }, - "execution_count": 30, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -502,49 +502,47 @@ "source": [ "# Excursion: Choosing a classification Model\n", "``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n", - "As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n", + "As such, we tested many different models on the small dataset and chose the best performing ones for the big dataset.\n", "\n", "### Initial Comparison\n", "We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n", "There were some rules as a baseline for comparison:\n", "- All Hyperparameters are set to default\n", - "- All iteration limits are set to 3000 (exception: MLPClassifier with 300, where i-limit are epochs instead of 
iterations )\n", + "- All iteration limits are set to 3000 (exception: MLPClassifier with 300, where the limit counts epochs instead of iterations)\n", "- All ``random_state``s are set to 0\n", "\n", "Running all models with that configuration yields the following weighted F1-Scores (results as seen in the ``games_march2025_cleaned_2k_i3k`` folder): \n", "\n", "![Comparison Image 2k](./compare_models_2k.png)\n", "\n", - "If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the 2k Dataset does not contain enough datapoints for every class (test data for 2 classes is 0), so we should proceed to the 10k Dataset before making major choices.\n", + "If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the Dataset does not contain enough datapoints for every class (test data for 2 classes is 0 in the 2k dataset), so we should proceed to the 10k Dataset.\n", "\n", "![Comparison Image 2k Micro/Macro/Weighted](./compare_models_2k_3.png)\n", "\n", "The 10 best performing models which will run on the 10k Dataset with the same rules as before:\n", - "1. NearestCentroid\n", + "1. PassiveAggressiveClassifier \n", "2. Perceptron\n", - "3. PassiveAggressiveClassifier\n", - "4. LinearSVC\n", - "5. SDGClassifer\n", - "6. HistGradientBoostingClassifier\n", + "3. LinearSVC\n", + "4. SGDClassifier\n", + "5. HistGradientBoostingClassifier\n", + "6. NearestCentroid\n", "7. MLPClassifier\n", - "8. RidgeClassifier\n", - "9. GradientBoostingClassifier\n", - "10. LinearDiscriminationAnalysis\n", + "8. GradientBoostingClassifier \n", + "9. RidgeClassifier\n", + "10. 
AdaBoostClassifier (because of an evaluation mistake, we used LinearDiscriminantAnalysis instead)\n", + "\n", + "That gave us the following results:\n", "\n", "![Comparison Image 10k](./compare_models_10k.png)\n", + "![Comparison Image 10k](./compare_models_10k_3.png)\n", "\n", - "We can also compare these models between datasets, to see if a bigger dataset always improves the performance.\n", + "The top 5 are the same, with the sole exception that Perceptron falls behind the RidgeClassifier.\n", + "When comparing these models between datasets, it is evident that a bigger dataset yields better performance (for exponentially higher compute and time cost). Only NearestCentroid lost performance when comparing the Datasets.\n", "\n", - "![Comparison Image between 2k and 10k](./compare_models_2k_10k.png)\n", + "![Comparison Image between 2k and 10k](./compare_datasets_2k.png)\n", + "![Comparison Image between 2k and 10k, only 10k Models](./compare_datasets_10k.png)\n", "\n", - "The final contenders are:\n", - "1.\n", - "2.\n", - "3.\n", - "4.\n", - "5.\n", - "\n", - "..." + "The final contenders are LinearSVC and PassiveAggressiveClassifier. Ideally we would compare them against each other using k-fold cross validation with different hyperparameters, but since training the model on the dataset takes a lot of time and puts a big strain on our computers, we will stop here and use the LinearSVC Classifier." ] }, { @@ -553,23 +551,27 @@ "metadata": {}, "source": [ "## Model Selection\n", - "**TODO Deciding which model to use for this task**\n", "\n", - "As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``" + "As a game can have multiple genres, our model has to be capable of multi-label classification. sklearn's ``MultiOutputClassifier`` can do this. 
As a backend for ``MultiOutputClassifier`` we use ``LinearSVC``" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "8c1d72c4532bd509", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [] + } + ], "source": [ - "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import LinearSVC\n", "from sklearn.multioutput import MultiOutputClassifier\n", "\n", - "# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is to bad)\n", - "multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n", + "multi_target_clf = MultiOutputClassifier(LinearSVC(max_iter=1337, random_state=0), n_jobs=1)\n", "\n", "multi_target_clf.fit(X_train, y_train)\n", "\n", @@ -587,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "e2ebea6945193e07", "metadata": {}, "outputs": [ @@ -597,23 +599,23 @@ "text": [ " precision recall f1-score support\n", "\n", - " 0 0.78 0.91 0.84 300\n", - " 1 0.78 0.62 0.69 216\n", - " 2 1.00 0.03 0.07 86\n", - " 3 0.00 0.00 0.00 46\n", - " 4 1.00 0.04 0.07 83\n", - " 5 0.79 0.81 0.80 245\n", - " 6 0.00 0.00 0.00 42\n", - " 7 0.90 0.34 0.49 127\n", - " 8 0.00 0.00 0.00 12\n", - " 9 0.89 0.25 0.39 127\n", - " 10 0.00 0.00 0.00 14\n", - " 11 0.88 0.14 0.24 106\n", + " 0 0.84 0.86 0.85 300\n", + " 1 0.74 0.63 0.68 216\n", + " 2 0.77 0.31 0.45 86\n", + " 3 0.50 0.04 0.08 46\n", + " 4 0.69 0.33 0.44 83\n", + " 5 0.79 0.80 0.79 245\n", + " 6 0.69 0.26 0.38 42\n", + " 7 0.74 0.62 0.68 127\n", + " 8 1.00 0.67 0.80 12\n", + " 9 0.80 0.57 0.67 127\n", + " 10 1.00 0.50 0.67 14\n", + " 11 0.79 0.46 0.58 106\n", "\n", - " micro avg 0.79 0.50 0.61 1404\n", - " macro avg 0.58 0.26 0.30 1404\n", - "weighted avg 0.77 0.50 0.53 1404\n", - " samples avg 0.77 0.56 0.60 1404\n", + " micro avg 0.79 0.62 0.69 1404\n", + " macro avg 0.78 0.51 0.59 1404\n", + 
"weighted avg 0.77 0.62 0.67 1404\n", + " samples avg 0.80 0.68 0.70 1404\n", "\n" ] } @@ -633,15 +635,6 @@ "**TODO optimize the model based on the test results**" ] }, - { - "cell_type": "markdown", - "id": "79b20645", - "metadata": {}, - "source": [ - "# Validation\n", - "**TODO Predict actual values**" - ] - }, { "cell_type": "markdown", "id": "3b709fb7",