diff --git a/compare_dataset_sizes.png b/compare_dataset_sizes.png new file mode 100644 index 0000000..0f8cbbb Binary files /dev/null and b/compare_dataset_sizes.png differ diff --git a/plot_maker.py b/compare_dataset_sizes.py similarity index 97% rename from plot_maker.py rename to compare_dataset_sizes.py index 331ae94..251d29b 100644 --- a/plot_maker.py +++ b/compare_dataset_sizes.py @@ -31,7 +31,7 @@ plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleane plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k") plt.xticks(x, models, rotation=45) -plt.ylabel("Weighted F1-Score") +plt.ylabel("F1-Score") plt.title("Model Performance across Datasets") plt.legend() plt.tight_layout() diff --git a/compare_models_2k.png b/compare_models_2k.png new file mode 100644 index 0000000..d85904f Binary files /dev/null and b/compare_models_2k.png differ diff --git a/games_march2025_cleaned/BernoulliNB.txt b/games_march2025_cleaned/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned/DecisionTreeClassifier.txt b/games_march2025_cleaned/DecisionTreeClassifier.txt deleted file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 
3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned/GaussianNB.txt b/games_march2025_cleaned/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 - 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned/GradientBoostingClassifier.txt b/games_march2025_cleaned/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned/LinearSVC-i5000.txt b/games_march2025_cleaned/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- 
a/games_march2025_cleaned/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned/LogisticRegression-i1000.txt b/games_march2025_cleaned/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned/LogisticRegression-i10000.txt b/games_march2025_cleaned/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 
-weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned/MLPClassifier-i10000.txt b/games_march2025_cleaned/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned/MultinomialNB.txt b/games_march2025_cleaned/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned/RandomForestClassifier.txt b/games_march2025_cleaned/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- a/games_march2025_cleaned/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 
0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned/SVC-RBF-i10000.txt b/games_march2025_cleaned/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/games_march2025_cleaned_10k/BernoulliNB.txt b/games_march2025_cleaned_10k/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned_10k/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned_10k/DecisionTreeClassifier.txt b/games_march2025_cleaned_10k/DecisionTreeClassifier.txt deleted file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned_10k/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 
0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned_10k/GaussianNB.txt b/games_march2025_cleaned_10k/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned_10k/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 - 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned_10k/GradientBoostingClassifier.txt b/games_march2025_cleaned_10k/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned_10k/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned_10k/LinearSVC-i5000.txt 
b/games_march2025_cleaned_10k/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- a/games_march2025_cleaned_10k/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned_10k/LogisticRegression-i1000.txt b/games_march2025_cleaned_10k/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_10k/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_10k/LogisticRegression-i10000.txt b/games_march2025_cleaned_10k/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_10k/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 
0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_10k/MLPClassifier-i10000.txt b/games_march2025_cleaned_10k/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned_10k/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned_10k/MultinomialNB.txt b/games_march2025_cleaned_10k/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned_10k/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned_10k/RandomForestClassifier.txt b/games_march2025_cleaned_10k/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- a/games_march2025_cleaned_10k/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 
0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned_10k/SVC-RBF-i10000.txt b/games_march2025_cleaned_10k/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned_10k/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/games_march2025_cleaned_2k/BernoulliNB.txt b/games_march2025_cleaned_2k/BernoulliNB.txt deleted file mode 100644 index f2237d4..0000000 --- a/games_march2025_cleaned_2k/BernoulliNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.75 0.90 0.82 300 - 1 0.72 0.68 0.70 216 - 2 0.50 0.08 0.14 86 - 3 0.27 0.07 0.11 46 - 4 0.40 0.07 0.12 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.82 0.79 245 - 7 0.33 0.10 0.15 42 - 8 0.67 0.40 0.50 127 - 9 0.00 0.00 0.00 12 - 10 0.71 0.37 0.49 127 - 11 0.00 0.00 0.00 14 - 12 0.49 0.31 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.70 0.55 0.62 1404 - macro avg 0.40 0.27 0.30 1404 -weighted avg 0.64 0.55 0.56 1404 - samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned_2k/DecisionTreeClassifier.txt b/games_march2025_cleaned_2k/DecisionTreeClassifier.txt deleted 
file mode 100644 index 900c256..0000000 --- a/games_march2025_cleaned_2k/DecisionTreeClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.73 0.75 300 - 1 0.56 0.53 0.54 216 - 2 0.36 0.33 0.34 86 - 3 0.33 0.26 0.29 46 - 4 0.40 0.46 0.43 83 - 5 0.00 0.00 0.00 0 - 6 0.65 0.61 0.63 245 - 7 0.39 0.40 0.40 42 - 8 0.59 0.57 0.58 127 - 9 0.60 0.25 0.35 12 - 10 0.56 0.51 0.53 127 - 11 0.39 0.50 0.44 14 - 12 0.52 0.49 0.50 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.58 0.55 0.57 1404 - macro avg 0.44 0.40 0.41 1404 -weighted avg 0.58 0.55 0.57 1404 - samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned_2k/GaussianNB.txt b/games_march2025_cleaned_2k/GaussianNB.txt deleted file mode 100644 index 83d7a2e..0000000 --- a/games_march2025_cleaned_2k/GaussianNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.76 0.80 0.78 300 - 1 0.62 0.51 0.56 216 - 2 0.63 0.14 0.23 86 - 3 0.17 0.02 0.04 46 - 4 0.42 0.10 0.16 83 - 5 0.00 0.00 0.00 0 - 6 0.68 0.66 0.67 245 - 7 0.56 0.12 0.20 42 - 8 0.55 0.33 0.41 127 - 9 0.67 0.17 0.27 12 - 10 0.65 0.31 0.42 127 - 11 1.00 0.14 0.25 14 - 12 0.53 0.29 0.38 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.66 0.47 0.55 1404 - macro avg 0.52 0.26 0.31 1404 -weighted avg 0.62 0.47 0.51 1404 - samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned_2k/GradientBoostingClassifier.txt b/games_march2025_cleaned_2k/GradientBoostingClassifier.txt deleted file mode 100644 index 7c8ce6e..0000000 --- a/games_march2025_cleaned_2k/GradientBoostingClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.80 0.83 300 - 1 0.77 0.61 0.68 216 - 2 0.55 0.13 0.21 86 - 3 0.42 0.11 0.17 46 - 4 0.68 0.33 0.44 83 - 5 0.00 0.00 0.00 0 - 6 0.71 0.76 0.74 245 - 7 0.61 0.26 0.37 42 - 8 0.81 0.50 0.61 127 - 9 0.75 0.25 0.38 12 - 10 0.81 0.54 0.65 127 - 11 0.40 0.43 0.41 14 - 12 0.69 0.42 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.76 0.57 0.65 1404 - 
macro avg 0.57 0.37 0.43 1404 -weighted avg 0.74 0.57 0.63 1404 - samples avg 0.76 0.63 0.65 1404 diff --git a/games_march2025_cleaned_2k/LinearSVC-i5000.txt b/games_march2025_cleaned_2k/LinearSVC-i5000.txt deleted file mode 100644 index df82b40..0000000 --- a/games_march2025_cleaned_2k/LinearSVC-i5000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.85 0.87 0.86 300 - 1 0.76 0.66 0.70 216 - 2 0.77 0.20 0.31 86 - 3 0.00 0.00 0.00 46 - 4 0.76 0.27 0.39 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.81 0.79 245 - 7 0.89 0.19 0.31 42 - 8 0.77 0.60 0.67 127 - 9 1.00 0.58 0.74 12 - 10 0.85 0.54 0.66 127 - 11 1.00 0.29 0.44 14 - 12 0.82 0.42 0.56 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.61 0.69 1404 - macro avg 0.66 0.39 0.46 1404 -weighted avg 0.78 0.61 0.66 1404 - samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned_2k/LogisticRegression-i1000.txt b/games_march2025_cleaned_2k/LogisticRegression-i1000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_2k/LogisticRegression-i1000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_2k/LogisticRegression-i10000.txt b/games_march2025_cleaned_2k/LogisticRegression-i10000.txt deleted file mode 100644 index b7926d4..0000000 --- a/games_march2025_cleaned_2k/LogisticRegression-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.78 0.91 0.84 300 - 1 0.78 0.62 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.04 
0.07 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.81 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.90 0.34 0.49 127 - 9 0.00 0.00 0.00 12 - 10 0.89 0.25 0.39 127 - 11 0.00 0.00 0.00 14 - 12 0.88 0.14 0.24 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.50 0.61 1404 - macro avg 0.50 0.22 0.26 1404 -weighted avg 0.77 0.50 0.53 1404 - samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_2k/MLPClassifier-i10000.txt b/games_march2025_cleaned_2k/MLPClassifier-i10000.txt deleted file mode 100644 index c4634dc..0000000 --- a/games_march2025_cleaned_2k/MLPClassifier-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.84 0.85 0.84 300 - 1 0.73 0.67 0.70 216 - 2 0.74 0.30 0.43 86 - 3 0.50 0.02 0.04 46 - 4 0.69 0.24 0.36 83 - 5 0.00 0.00 0.00 0 - 6 0.79 0.79 0.79 245 - 7 0.86 0.14 0.24 42 - 8 0.76 0.63 0.69 127 - 9 1.00 0.33 0.50 12 - 10 0.81 0.52 0.63 127 - 11 1.00 0.14 0.25 14 - 12 0.75 0.41 0.53 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.60 0.68 1404 - macro avg 0.68 0.36 0.43 1404 -weighted avg 0.78 0.60 0.65 1404 - samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned_2k/MultinomialNB.txt b/games_march2025_cleaned_2k/MultinomialNB.txt deleted file mode 100644 index bc74cf3..0000000 --- a/games_march2025_cleaned_2k/MultinomialNB.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.64 0.99 0.78 300 - 1 0.85 0.24 0.37 216 - 2 0.60 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 0.80 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.78 0.80 0.79 245 - 7 0.40 0.05 0.09 42 - 8 1.00 0.04 0.08 127 - 9 0.00 0.00 0.00 12 - 10 0.20 0.01 0.02 127 - 11 0.00 0.00 0.00 14 - 12 1.00 0.05 0.09 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.69 0.40 0.51 1404 - macro avg 0.45 0.16 0.17 1404 -weighted avg 0.68 0.40 0.39 1404 - samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned_2k/RandomForestClassifier.txt b/games_march2025_cleaned_2k/RandomForestClassifier.txt deleted file mode 100644 index 6fbe546..0000000 --- 
a/games_march2025_cleaned_2k/RandomForestClassifier.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.80 0.88 0.84 300 - 1 0.78 0.55 0.64 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.06 0.11 83 - 5 0.00 0.00 0.00 0 - 6 0.74 0.78 0.76 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.24 0.38 127 - 9 0.00 0.00 0.00 12 - 10 0.91 0.24 0.38 127 - 11 1.00 0.14 0.25 14 - 12 1.00 0.25 0.39 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.79 0.48 0.59 1404 - macro avg 0.58 0.23 0.27 1404 -weighted avg 0.78 0.48 0.52 1404 - samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned_2k/SVC-RBF-i10000.txt b/games_march2025_cleaned_2k/SVC-RBF-i10000.txt deleted file mode 100644 index ff0c7b7..0000000 --- a/games_march2025_cleaned_2k/SVC-RBF-i10000.txt +++ /dev/null @@ -1,21 +0,0 @@ - precision recall f1-score support - - 0 0.81 0.90 0.85 300 - 1 0.76 0.63 0.69 216 - 2 1.00 0.03 0.07 86 - 3 0.00 0.00 0.00 46 - 4 1.00 0.05 0.09 83 - 5 0.00 0.00 0.00 0 - 6 0.77 0.83 0.80 245 - 7 0.00 0.00 0.00 42 - 8 0.84 0.40 0.54 127 - 9 1.00 0.17 0.29 12 - 10 0.90 0.34 0.49 127 - 11 1.00 0.14 0.25 14 - 12 0.92 0.21 0.34 106 - 13 0.00 0.00 0.00 0 - - micro avg 0.80 0.53 0.63 1404 - macro avg 0.64 0.26 0.32 1404 -weighted avg 0.79 0.53 0.56 1404 - samples avg 0.79 0.59 0.63 1404 diff --git a/comparison.py b/generate_compare_dataset.py similarity index 65% rename from comparison.py rename to generate_compare_dataset.py index fcced39..d5cd4d5 100644 --- a/comparison.py +++ b/generate_compare_dataset.py @@ -4,12 +4,8 @@ import pandas as pd from sklearn import set_config from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import FunctionTransformer - -from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer import ast - - from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.multioutput import 
MultiOutputClassifier @@ -20,15 +16,19 @@ from sklearn.metrics import accuracy_score, classification_report from sklearn.svm import SVC, LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier -from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB -from sklearn.neighbors import KNeighborsClassifier +from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier +from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier +from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.dummy import DummyClassifier from sklearn.neural_network import MLPClassifier - set_config(transform_output="pandas") # dataframe supremacy - def prepDataset(dataset): #returns X_train, X_test, y_train, y_test - dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",") + dataset = pd.read_csv(dataset,sep=",") # desc, genres, tags column_transformer = ColumnTransformer([ # merge all descriptions @@ -39,9 +39,6 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test verbose_feature_names_out=False ) dataset = column_transformer.fit_transform(dataset) - - - #### SET MISSING VALUES print("SETMISS") # Setting missing numeric values to the mean @@ -50,36 +47,26 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test dataset.fillna('', inplace=True) # Setting missing values in other columns to NaN dataset.dropna(inplace=True) - ##### 
STRUCTURIZE GENRES to onehot #serialize array dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) #print(dataset['genres']) # in py but not yet onehotenc - # MultiLabelBinarizer does onehotenc for arrays mlb_genres = MultiLabelBinarizer() genres_encoded = mlb_genres.fit_transform(dataset.pop('genres')) #genres_count = len(mlb_genres.classes_) # for multi-label classifiction later - genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_) #print(genres_df) #dataset = pd.concat([dataset, genres_df], axis=1) #print(dataset) - - #### convert text to bag of words - ## Count vs Tfidf vectorizer vectorizer = TfidfVectorizer() tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) #print(tfidf_df) - - ##### MODEL print("MODEL") - - X = tfidf_df y = genres_df # cleanup datapoints that dont have a target value (all target columns are 0) @@ -87,50 +74,63 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test #print((mask == False).sum()) #31 cases with all target columns 0 X_clean = X[mask] y_clean = y[mask] - # Split dataset return train_test_split(X_clean, y_clean, random_state=0) - def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0) - # model training multi_target_clf.fit(X_train, y_train) - # predict against test data y_pred = multi_target_clf.predict(X_test) return classification_report(y_test, y_pred, zero_division=0.0) - datasets = [ 'games_march2025_cleaned_2k.csv', - 'games_march2025_cleaned_10k.csv', - 'games_march2025_cleaned.csv' + #'games_march2025_cleaned_10k.csv', + #'games_march2025_cleaned.csv' ] +max_iter = 3000 # <-- set your desired value here + estimators = { - "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0), - 
"LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0), - "LinearSVC-i5000": LinearSVC(max_iter=5000), - "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000), + "LogisticRegression": LogisticRegression(random_state=0, max_iter=max_iter), + "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), + "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), + "Perceptron": Perceptron(random_state=0, max_iter=max_iter), + "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), + "KNeighborsClassifier": KNeighborsClassifier(), + "NearestCentroid": NearestCentroid(), + "RadiusNeighborsClassifier": RadiusNeighborsClassifier(), + "LinearSVC-i5000": LinearSVC(random_state=0, max_iter=max_iter), + "SVC": SVC(random_state=0, max_iter=max_iter), "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0), "RandomForestClassifier": RandomForestClassifier(random_state=0), + "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0), + "BaggingClassifier": BaggingClassifier(random_state=0), + "AdaBoostClassifier": AdaBoostClassifier(random_state=0), "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0), + "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter), "GaussianNB": GaussianNB(), "MultinomialNB": MultinomialNB(), "BernoulliNB": BernoulliNB(), + "ComplementNB": ComplementNB(), + "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), + "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(), "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0), + "DummyClassifier": DummyClassifier(random_state=0) } +#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]), +#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]), for dataset in datasets: print("-" * 60) print("dataset -> 
" + dataset) - print("-" * 60) print("mkdir") folder = dataset.split(".csv")[0] if not os.path.isdir(folder): os.mkdir(folder) X_train, X_test, y_train, y_test = prepDataset(dataset) for esti in estimators: + print("model: " + esti) compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can print("open") f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8") diff --git a/notebook.ipynb b/notebook.ipynb index 3307ceb..d1afe52 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -23,36 +23,7 @@ "is_executing": true } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " appid name release_date required_age price dlc_count \\\n", - "0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n", - "\n", - " detailed_description \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "\n", - " about_the_game \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "\n", - " short_description reviews ... \\\n", - "0 For over two decades, Counter-Strike has offer... NaN ... \n", - "\n", - " average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n", - "0 879 5174 350 \n", - "\n", - " discount peak_ccu tags \\\n", - "0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n", - "\n", - " pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n", - "0 86 8632939 82 96473 \n", - "\n", - "[1 rows x 47 columns]\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -120,27 +91,7 @@ "is_executing": true } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " desc \\\n", - "0 For over two decades, Counter-Strike has offer... \n", - "1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n", - "2 The most-played game on Steam. Every day, mill... \n", - "3 When a young street hustler, a retired bank ro... \n", - "4 Edition Comparison Ultimate Edition The Tom Cl... 
\n", - "\n", - " genres \n", - "0 ['Action', 'Free To Play'] \n", - "1 ['Action', 'Adventure', 'Massively Multiplayer... \n", - "2 ['Action', 'Strategy', 'Free To Play'] \n", - "3 ['Action', 'Adventure'] \n", - "4 ['Action'] \n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import FunctionTransformer\n", @@ -200,20 +151,7 @@ "execution_count": null, "id": "ebc5a24e9bc87fdd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 [Action, Free To Play]\n", - "1 [Action, Adventure, Massively Multiplayer, Fre...\n", - "2 [Action, Strategy, Free To Play]\n", - "3 [Action, Adventure]\n", - "4 [Action]\n", - "Name: genres, dtype: object\n" - ] - } - ], + "outputs": [], "source": [ "import ast\n", "\n", @@ -236,27 +174,7 @@ "execution_count": null, "id": "d2c3527a5fc876bf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Action Adventure Casual Early Access Free To Play Gore Indie \\\n", - "0 1 0 0 0 1 0 0 \n", - "1 1 1 0 0 1 0 0 \n", - "2 1 0 0 0 1 0 0 \n", - "3 1 1 0 0 0 0 0 \n", - "4 1 0 0 0 0 0 0 \n", - "\n", - " Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n", - "0 0 0 0 0 0 0 0 \n", - "1 1 0 0 0 0 0 0 \n", - "2 0 0 0 0 0 1 0 \n", - "3 0 0 0 0 0 0 0 \n", - "4 0 0 0 0 0 0 0 \n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.preprocessing import MultiLabelBinarizer\n", "\n", @@ -288,29 +206,7 @@ "execution_count": null, "id": "4e8b407c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n", - "0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 
0.0 0.0 \n", - "4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", - "\n", - " 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n", - "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "\n", - "[5 rows x 29351 columns]\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", @@ -356,15 +252,7 @@ "execution_count": null, "id": "4919bf1b37d171a7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13\n" - ] - } - ], + "outputs": [], "source": [ "mask = y.sum(axis=1).map(lambda x: x > 0)\n", "print((mask == False).sum()) # count of unpredictable datapoints\n", @@ -399,12 +287,38 @@ "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)" ] }, + { + "cell_type": "markdown", + "id": "84f56229", + "metadata": {}, + "source": [ + "Now that all data is prepared, we need to choose a Classification Model that meets our standards." 
+ ] + }, + { + "cell_type": "markdown", + "id": "917ba82f", + "metadata": {}, + "source": [ + "# Excursion: Choosing a classification Model\n", + "``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n", + "As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n", + "\n", + "### The comparison\n", + "We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n", + "There were some rules as a baseline for comparison:\n", + "- All Hyperparameters are set to default\n", + "- All iteration limits are set to 3000\n", + "\n", + "![Comparison Image](./compare_models_2k.png)" + ] + }, { "cell_type": "markdown", "id": "12b5283d", "metadata": {}, "source": [ - "# Model Selection\n", + "## Model Selection\n", "**TODO Deciding which model to use for this task**\n", "\n", "As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. 
As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``" @@ -442,36 +356,7 @@ "execution_count": null, "id": "e2ebea6945193e07", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.78 0.91 0.84 300\n", - " 1 0.78 0.62 0.69 216\n", - " 2 1.00 0.03 0.07 86\n", - " 3 0.00 0.00 0.00 46\n", - " 4 1.00 0.04 0.07 83\n", - " 5 0.00 0.00 0.00 0\n", - " 6 0.79 0.81 0.80 245\n", - " 7 0.00 0.00 0.00 42\n", - " 8 0.90 0.34 0.49 127\n", - " 9 0.00 0.00 0.00 12\n", - " 10 0.89 0.25 0.39 127\n", - " 11 0.00 0.00 0.00 14\n", - " 12 0.88 0.14 0.24 106\n", - " 13 0.00 0.00 0.00 0\n", - "\n", - " micro avg 0.79 0.50 0.61 1404\n", - " macro avg 0.50 0.22 0.26 1404\n", - "weighted avg 0.77 0.50 0.53 1404\n", - " samples avg 0.77 0.56 0.60 1404\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.metrics import classification_report\n", "\n",