changes

2025-08-18 20:22:59 +02:00
parent 28df88c0bf
commit 530d312dfd
31 changed files with 725 additions and 20 deletions
--- a/compare_dataset_sizes.png
+++ b/compare_dataset_sizes.png
--- a/compare_dataset_sizes.py
+++ b/compare_dataset_sizes.py
@@ -22,17 +22,20 @@ for dataset_name, folder in datasets.items():
                        results[dataset_name][model_name] = f1_score

 # Plot
-models = sorted(results["cleaned"].keys())  # alphabetisch sortieren für gleiche Reihenfolge
+#models = sorted(results["cleaned_2k"].keys())  # alphabetisch sortieren für gleiche Reihenfolge
+models = dict(sorted(results["cleaned_2k"].items(), key=lambda i: i[1], reverse=True)) # nach values sortieren
 x = range(len(models))

 plt.figure(figsize=(12,6))
-plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
-plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleaned_2k")
-plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
+#plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
+plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k")
+#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")

-plt.xticks(x, models, rotation=45)
-plt.ylabel("F1-Score")
+plt.xticks(x, models, rotation=90)
+plt.ylim(0, 1) # min max
+plt.ylabel("Weighted F1-Score")
 plt.title("Model Performance across Datasets")
-plt.legend()
+#plt.legend()
 plt.tight_layout()
+plt.savefig('compare_graph_latest.png')
 plt.show()
--- a/compare_graph_maker_3.py
+++ b/compare_graph_maker_3.py
@@ -0,0 +1,59 @@
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Map a short dataset label to the folder holding one classification report
+# (*.txt) per model. The alternative datasets are kept for quick switching.
+datasets = {
+    #"cleaned": "games_march2025_cleaned",
+    #"cleaned_2k": "games_march2025_cleaned_2k",
+    #"cleaned_10k": "games_march2025_cleaned_10k"
+    "cleaned_2k": "games_march2025_cleaned_2k_i3k",
+}
+# results[dataset][model] -> {0: micro F1, 1: macro F1, 2: weighted F1}
+results = {}
+
+for dataset_name, folder in datasets.items():
+    results[dataset_name] = {}
+    for filename in os.listdir(folder):
+        if not filename.endswith(".txt"):
+            continue
+        # Strip only the trailing extension; str.replace would also remove a
+        # ".txt" that happens to appear inside the model name.
+        model_name = filename[:-len(".txt")]
+        print("model " + model_name)
+        scores = {}
+        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
+            for line in f:
+                row = line.strip()
+                # Summary rows look like "micro avg  P  R  F1  support"; the
+                # two-word label puts the F1 value at token index 4.
+                if row.startswith("micro avg"):
+                    print("micro")
+                    scores[0] = float(row.split()[4]) # micro f1
+                elif row.startswith("macro avg"):
+                    print("macro")
+                    scores[1] = float(row.split()[4]) # macro f1
+                elif row.startswith("weighted avg"):
+                    print("weight")
+                    scores[2] = float(row.split()[4]) # weighted avg f1
+        if len(scores) < 3:
+            # Without all three averages the sort key and the bar lookups
+            # below would raise KeyError, so leave this model out of the plot.
+            print("skipping " + model_name + ": incomplete report")
+            continue
+        results[dataset_name][model_name] = scores
+
+# Plot
+# Order the models by weighted F1 (slot 2), best first.
+models = dict(sorted(results["cleaned_2k"].items(), key=lambda i: i[1][2], reverse=True))
+print(models)
+x = range(len(models))
+
+plt.figure()
+
+# Three bars per model, each 0.25 wide, centred on the model's tick position.
+plt.bar([i - 0.25 for i in x], [results["cleaned_2k"][m][0] for m in models], width=0.25, label="Micro")
+plt.bar(x,                     [results["cleaned_2k"][m][1] for m in models], width=0.25, label="Macro")
+plt.bar([i + 0.25 for i in x], [results["cleaned_2k"][m][2] for m in models], width=0.25, label="Weighted")
+
+plt.xticks(x, models, rotation=90)
+plt.ylabel("F1 Score")
+plt.ylim(0,1)
+plt.title("Model Performance - 2k Dataset")
+plt.legend()
+plt.tight_layout()
+# Save before showing so the image is written regardless of the GUI backend.
+plt.savefig('compare_graph_latest_3.png')
+plt.show()
--- a/compare_models_10k.py
+++ b/compare_models_10k.py
@@ -0,0 +1,126 @@
+import os
+import numpy as np
+import pandas as pd
+from sklearn import set_config
+
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
+import ast
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.metrics import classification_report
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_iris
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.svm import SVC, LinearSVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier
+from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
+from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.dummy import DummyClassifier
+from sklearn.neural_network import MLPClassifier
+
+set_config(transform_output="pandas") # dataframe supremacy
+
+jobs = 12
+max_iter = 3000
+
+def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
+    """Load a games CSV and build a train/test split for genre prediction.
+
+    Args:
+        dataset: Path (or anything ``pd.read_csv`` accepts) of a
+            comma-separated file containing the columns
+            ``detailed_description``, ``about_the_game``,
+            ``short_description`` and ``genres``.
+
+    Returns:
+        The result of ``train_test_split(X, y, random_state=0)``, i.e.
+        ``X_train, X_test, y_train, y_test``, where X holds the TF-IDF
+        features of the merged description text and y holds one binary
+        column per genre. Rows without any genre are excluded.
+    """
+    frame = pd.read_csv(dataset,sep=",")
+    # desc, genres, tags
+    column_transformer = ColumnTransformer([
+            # merge all descriptions into a single "desc" text column
+            ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
+                ['detailed_description', 'about_the_game', 'short_description']),
+            ('pass', 'passthrough', ['genres']),#, 'tags'
+        ],
+        verbose_feature_names_out=False
+    )
+    frame = column_transformer.fit_transform(frame)
+    #### SET MISSING VALUES
+    print("SETMISS")
+    # Fill missing numeric values with the column mean (presumably a no-op
+    # here, since only "desc" and "genres" are left after the transformer).
+    frame.fillna(frame.mean(numeric_only=True), inplace=True)
+    # Fill every remaining missing value with an empty string
+    frame.fillna('', inplace=True)
+    # Drop any row that still contains a missing value
+    frame.dropna(inplace=True)
+    ##### STRUCTURIZE GENRES to onehot
+    # "genres" holds the text of a Python literal (presumably a list of genre
+    # names - verify against the CSV). Missing genres were replaced by '' above,
+    # which ast.literal_eval cannot parse, so map them to "no genres"; those
+    # rows are removed by the all-zero target filter further down.
+    frame['genres'] = frame['genres'].map(lambda s: ast.literal_eval(s) if s else [])
+    # MultiLabelBinarizer does onehotenc for arrays
+    mlb_genres = MultiLabelBinarizer()
+    genres_encoded = mlb_genres.fit_transform(frame.pop('genres'))
+    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
+    #### convert text to bag of words
+    ## Count vs Tfidf vectorizer
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(frame['desc']) # matrix
+    # NOTE: toarray() materializes the full dense matrix in memory.
+    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
+    ##### MODEL
+    print("MODEL")
+    X = tfidf_df
+    y = genres_df
+    # cleanup datapoints that dont have a target value (all target columns are 0)
+    has_target = y.sum(axis=1) > 0
+    X_clean = X[has_target]
+    y_clean = y[has_target]
+    # Split dataset
+    return train_test_split(X_clean, y_clean, random_state=0)
+def comparison(X_train, X_test, y_train, y_test, estimator): #returns class_report
+    """Train ``estimator`` on every target column and report test metrics.
+
+    Args:
+        X_train, y_train: Features and multi-column targets used for fitting.
+        X_test, y_test: Held-out features and targets used for evaluation.
+        estimator: Base classifier wrapped in a ``MultiOutputClassifier``.
+
+    Returns:
+        The text produced by ``classification_report`` for the test split.
+    """
+    # The degree of parallelism comes from the module-level ``jobs`` setting.
+    fitted = MultiOutputClassifier(estimator, n_jobs=jobs).fit(X_train, y_train)
+    predictions = fitted.predict(X_test)
+    # zero_division=0.0 reports 0 instead of warning when a metric is undefined.
+    return classification_report(y_test, predictions, zero_division=0.0)
+# CSV files to evaluate; each gets its own output folder of per-model reports.
+datasets = [
+    #'games_march2025_cleaned_2k.csv',
+    'games_march2025_cleaned_10k.csv',
+    #'games_march2025_cleaned.csv'
+]
+# Report file name (without extension) -> configured base estimator.
+estimators = {
+    "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
+    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
+    "Perceptron": Perceptron(random_state=0, max_iter=max_iter),
+    "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
+    "NearestCentroid": NearestCentroid(),
+    "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
+    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
+    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
+    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
+    "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
+}
+
+#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
+#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
+for dataset in datasets:
+    print("-" * 60)
+    print("dataset -> " + dataset)
+    print("mkdir")
+    # Reports go into a folder named after the CSV file without its extension.
+    folder = dataset.split(".csv")[0]
+    # exist_ok avoids the check-then-create race of isdir() + mkdir().
+    os.makedirs(folder, exist_ok=True)
+    # The split is computed once per dataset and shared by all estimators.
+    X_train, X_test, y_train, y_test = prepDataset(dataset)
+    for esti, estimator in estimators.items():
+        print("model: " + esti)
+        compari = comparison(X_train, X_test, y_train, y_test, estimator)
+        print("open")
+        # The context manager closes the file even if the write fails.
+        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
+            f.write(compari)
+            print("write")
+        print("close")
--- a/compare_models_2k.png
+++ b/compare_models_2k.png
--- a/generate_compare_dataset.py
+++ b/generate_compare_dataset.py
@@ -27,6 +27,10 @@ from sklearn.dummy import DummyClassifier
 from sklearn.neural_network import MLPClassifier

 set_config(transform_output="pandas") # dataframe supremacy
+
+jobs = 12
+max_iter = 3000
+
 def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
    dataset = pd.read_csv(dataset,sep=",")
    # desc, genres, tags
@@ -76,7 +80,7 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
    y_clean = y[mask]
    # Split dataset
    return train_test_split(X_clean, y_clean, random_state=0)
-def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report
+def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report
    multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0)
    # model training
    multi_target_clf.fit(X_train, y_train)
@@ -88,9 +92,6 @@ datasets = [
    #'games_march2025_cleaned_10k.csv',
    #'games_march2025_cleaned.csv'
 ]
-
-max_iter = 3000  # <-- set your desired value here
-
 estimators = {
    "LogisticRegression": LogisticRegression(random_state=0, max_iter=max_iter),
    "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
@@ -99,8 +100,8 @@ estimators = {
    "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "NearestCentroid": NearestCentroid(),
-    "RadiusNeighborsClassifier": RadiusNeighborsClassifier(),
-    "LinearSVC-i5000": LinearSVC(random_state=0, max_iter=max_iter),
+    # "RadiusNeighborsClassifier": RadiusNeighborsClassifier(), # failed bcs no neighbours in range :sob:
+    "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
    "SVC": SVC(random_state=0, max_iter=max_iter),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
@@ -114,8 +115,7 @@ estimators = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
-    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
-    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
+    "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/5), verbose=True),
    "DummyClassifier": DummyClassifier(random_state=0)
 }

@@ -131,7 +131,7 @@ for dataset in datasets:
    X_train, X_test, y_train, y_test = prepDataset(dataset)
    for esti in estimators:
        print("model: " + esti)
-        compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can
+        compari = comparison(X_train, X_test, y_train, y_test, estimators[esti])
        print("open")
        f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8")
        f.write(compari)
--- a/compare_models_2k_3.png
+++ b/compare_models_2k_3.png
--- a/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.87      0.76      0.81       300
+           1       0.70      0.59      0.64       216
+           2       0.58      0.13      0.21        86
+           3       0.56      0.11      0.18        46
+           4       0.71      0.30      0.42        83
+           5       0.00      0.00      0.00         0
+           6       0.69      0.70      0.69       245
+           7       0.62      0.31      0.41        42
+           8       0.76      0.41      0.53       127
+           9       1.00      0.50      0.67        12
+          10       0.67      0.50      0.57       127
+          11       0.40      0.29      0.33        14
+          12       0.74      0.45      0.56       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.74      0.54      0.62      1404
+   macro avg       0.59      0.36      0.43      1404
+weighted avg       0.73      0.54      0.60      1404
+ samples avg       0.74      0.59      0.61      1404
--- a/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.86      0.70      0.77       300
+           1       0.72      0.50      0.59       216
+           2       0.47      0.09      0.16        86
+           3       0.50      0.04      0.08        46
+           4       0.58      0.23      0.33        83
+           5       0.00      0.00      0.00         0
+           6       0.71      0.64      0.67       245
+           7       0.80      0.29      0.42        42
+           8       0.79      0.46      0.58       127
+           9       1.00      0.25      0.40        12
+          10       0.71      0.43      0.53       127
+          11       0.40      0.29      0.33        14
+          12       0.68      0.42      0.52       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.74      0.49      0.59      1404
+   macro avg       0.59      0.31      0.39      1404
+weighted avg       0.72      0.49      0.56      1404
+ samples avg       0.70      0.54      0.57      1404
--- a/games_march2025_cleaned_2k_i3k/BernoulliNB.txt
+++ b/games_march2025_cleaned_2k_i3k/BernoulliNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.75      0.90      0.82       300
+           1       0.72      0.68      0.70       216
+           2       0.50      0.08      0.14        86
+           3       0.27      0.07      0.11        46
+           4       0.40      0.07      0.12        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.82      0.79       245
+           7       0.33      0.10      0.15        42
+           8       0.67      0.40      0.50       127
+           9       0.00      0.00      0.00        12
+          10       0.71      0.37      0.49       127
+          11       0.00      0.00      0.00        14
+          12       0.49      0.31      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.55      0.62      1404
+   macro avg       0.40      0.27      0.30      1404
+weighted avg       0.64      0.55      0.56      1404
+ samples avg       0.73      0.59      0.61      1404
--- a/games_march2025_cleaned_2k_i3k/ComplementNB.txt
+++ b/games_march2025_cleaned_2k_i3k/ComplementNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.67      0.98      0.80       300
+           1       0.81      0.36      0.50       216
+           2       0.67      0.05      0.09        86
+           3       0.00      0.00      0.00        46
+           4       0.80      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.81      0.79       245
+           7       0.40      0.05      0.09        42
+           8       0.83      0.04      0.08       127
+           9       0.00      0.00      0.00        12
+          10       0.43      0.02      0.04       127
+          11       0.00      0.00      0.00        14
+          12       1.00      0.05      0.09       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.42      0.53      1404
+   macro avg       0.46      0.17      0.18      1404
+weighted avg       0.69      0.42      0.42      1404
+ samples avg       0.71      0.46      0.52      1404
--- a/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.73      0.75       300
+           1       0.56      0.53      0.54       216
+           2       0.36      0.33      0.34        86
+           3       0.33      0.26      0.29        46
+           4       0.40      0.46      0.43        83
+           5       0.00      0.00      0.00         0
+           6       0.65      0.61      0.63       245
+           7       0.39      0.40      0.40        42
+           8       0.59      0.57      0.58       127
+           9       0.60      0.25      0.35        12
+          10       0.56      0.51      0.53       127
+          11       0.39      0.50      0.44        14
+          12       0.52      0.49      0.50       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.58      0.55      0.57      1404
+   macro avg       0.44      0.40      0.41      1404
+weighted avg       0.58      0.55      0.57      1404
+ samples avg       0.59      0.59      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/DummyClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/DummyClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.60      1.00      0.75       300
+           1       0.00      0.00      0.00       216
+           2       0.00      0.00      0.00        86
+           3       0.00      0.00      0.00        46
+           4       0.00      0.00      0.00        83
+           5       0.00      0.00      0.00         0
+           6       0.00      0.00      0.00       245
+           7       0.00      0.00      0.00        42
+           8       0.00      0.00      0.00       127
+           9       0.00      0.00      0.00        12
+          10       0.00      0.00      0.00       127
+          11       0.00      0.00      0.00        14
+          12       0.00      0.00      0.00       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.60      0.21      0.32      1404
+   macro avg       0.04      0.07      0.05      1404
+weighted avg       0.13      0.21      0.16      1404
+ samples avg       0.60      0.26      0.34      1404
--- a/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.81      0.91      0.86       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.73      0.75       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.24      0.38       127
+           9       1.00      0.17      0.29        12
+          10       0.90      0.21      0.34       127
+          11       1.00      0.14      0.25        14
+          12       0.83      0.18      0.29       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.48      0.60      1404
+   macro avg       0.64      0.23      0.29      1404
+weighted avg       0.79      0.48      0.52      1404
+ samples avg       0.78      0.54      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/GaussianNB.txt
+++ b/games_march2025_cleaned_2k_i3k/GaussianNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.80      0.78       300
+           1       0.62      0.51      0.56       216
+           2       0.63      0.14      0.23        86
+           3       0.17      0.02      0.04        46
+           4       0.42      0.10      0.16        83
+           5       0.00      0.00      0.00         0
+           6       0.68      0.66      0.67       245
+           7       0.56      0.12      0.20        42
+           8       0.55      0.33      0.41       127
+           9       0.67      0.17      0.27        12
+          10       0.65      0.31      0.42       127
+          11       1.00      0.14      0.25        14
+          12       0.53      0.29      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.66      0.47      0.55      1404
+   macro avg       0.52      0.26      0.31      1404
+weighted avg       0.62      0.47      0.51      1404
+ samples avg       0.67      0.53      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.87      0.80      0.83       300
+           1       0.77      0.61      0.68       216
+           2       0.55      0.13      0.21        86
+           3       0.42      0.11      0.17        46
+           4       0.68      0.33      0.44        83
+           5       0.00      0.00      0.00         0
+           6       0.71      0.76      0.74       245
+           7       0.61      0.26      0.37        42
+           8       0.81      0.50      0.61       127
+           9       0.75      0.25      0.38        12
+          10       0.81      0.54      0.65       127
+          11       0.40      0.43      0.41        14
+          12       0.69      0.42      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.76      0.57      0.65      1404
+   macro avg       0.58      0.37      0.43      1404
+weighted avg       0.74      0.57      0.63      1404
+ samples avg       0.77      0.63      0.65      1404
--- a/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.83      0.83      0.83       300
+           1       0.74      0.69      0.72       216
+           2       0.80      0.28      0.41        86
+           3       1.00      0.04      0.08        46
+           4       0.70      0.39      0.50        83
+           5       0.00      0.00      0.00         0
+           6       0.72      0.76      0.74       245
+           7       0.73      0.19      0.30        42
+           8       0.85      0.59      0.70       127
+           9       1.00      0.33      0.50        12
+          10       0.78      0.54      0.64       127
+          11       0.43      0.21      0.29        14
+          12       0.77      0.52      0.62       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.78      0.61      0.68      1404
+   macro avg       0.67      0.38      0.45      1404
+weighted avg       0.78      0.61      0.66      1404
+ samples avg       0.79      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.82      0.62      0.70       300
+           1       0.69      0.46      0.55       216
+           2       0.62      0.06      0.11        86
+           3       0.20      0.02      0.04        46
+           4       0.72      0.16      0.26        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.55      0.64       245
+           7       0.38      0.12      0.18        42
+           8       0.59      0.65      0.62       127
+           9       1.00      0.67      0.80        12
+          10       0.68      0.44      0.54       127
+          11       1.00      0.29      0.44        14
+          12       0.34      0.76      0.48       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.64      0.48      0.55      1404
+   macro avg       0.56      0.34      0.38      1404
+weighted avg       0.68      0.48      0.53      1404
+ samples avg       0.64      0.54      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt
+++ b/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.63      0.68      0.66       300
+           1       0.47      0.56      0.51       216
+           2       0.27      0.59      0.37        86
+           3       0.06      0.28      0.10        46
+           4       0.21      0.52      0.30        83
+           5       0.00      0.00      0.00         0
+           6       0.63      0.67      0.65       245
+           7       0.06      0.29      0.10        42
+           8       0.28      0.52      0.36       127
+           9       0.03      0.42      0.06        12
+          10       0.29      0.52      0.38       127
+          11       0.04      0.43      0.07        14
+          12       0.53      0.44      0.48       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.30      0.57      0.39      1404
+   macro avg       0.25      0.42      0.29      1404
+weighted avg       0.44      0.57      0.48      1404
+ samples avg       0.42      0.62      0.40      1404
--- a/games_march2025_cleaned_2k_i3k/LinearSVC.txt
+++ b/games_march2025_cleaned_2k_i3k/LinearSVC.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.87      0.86       300
+           1       0.76      0.66      0.70       216
+           2       0.77      0.20      0.31        86
+           3       0.00      0.00      0.00        46
+           4       0.76      0.27      0.39        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.81      0.79       245
+           7       0.89      0.19      0.31        42
+           8       0.77      0.60      0.67       127
+           9       1.00      0.58      0.74        12
+          10       0.85      0.54      0.66       127
+          11       1.00      0.29      0.44        14
+          12       0.82      0.42      0.56       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.61      0.69      1404
+   macro avg       0.66      0.39      0.46      1404
+weighted avg       0.78      0.61      0.66      1404
+ samples avg       0.81      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/LogisticRegression.txt
+++ b/games_march2025_cleaned_2k_i3k/LogisticRegression.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/MLPClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/MLPClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.85      0.84       300
+           1       0.73      0.67      0.70       216
+           2       0.74      0.30      0.43        86
+           3       0.50      0.02      0.04        46
+           4       0.69      0.24      0.36        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.79      0.79       245
+           7       0.86      0.14      0.24        42
+           8       0.76      0.63      0.69       127
+           9       1.00      0.33      0.50        12
+          10       0.81      0.52      0.63       127
+          11       1.00      0.14      0.25        14
+          12       0.75      0.41      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.60      0.68      1404
+   macro avg       0.68      0.36      0.43      1404
+weighted avg       0.78      0.60      0.65      1404
+ samples avg       0.80      0.66      0.68      1404
--- a/games_march2025_cleaned_2k_i3k/MultinomialNB.txt
+++ b/games_march2025_cleaned_2k_i3k/MultinomialNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.64      0.99      0.78       300
+           1       0.85      0.24      0.37       216
+           2       0.60      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       0.80      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.80      0.79       245
+           7       0.40      0.05      0.09        42
+           8       1.00      0.04      0.08       127
+           9       0.00      0.00      0.00        12
+          10       0.20      0.01      0.02       127
+          11       0.00      0.00      0.00        14
+          12       1.00      0.05      0.09       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.69      0.40      0.51      1404
+   macro avg       0.45      0.16      0.17      1404
+weighted avg       0.68      0.40      0.39      1404
+ samples avg       0.70      0.44      0.50      1404
--- a/games_march2025_cleaned_2k_i3k/NearestCentroid.txt
+++ b/games_march2025_cleaned_2k_i3k/NearestCentroid.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.83      0.75      0.79       300
+           1       0.65      0.75      0.70       216
+           2       0.43      0.72      0.54        86
+           3       0.18      0.33      0.23        46
+           4       0.46      0.61      0.53        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.76      0.75       245
+           7       0.31      0.62      0.41        42
+           8       0.47      0.69      0.55       127
+           9       1.00      0.67      0.80        12
+          10       0.59      0.69      0.64       127
+          11       0.60      0.64      0.62        14
+          12       0.42      0.66      0.52       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.57      0.70      0.63      1404
+   macro avg       0.48      0.56      0.50      1404
+weighted avg       0.62      0.70      0.65      1404
+ samples avg       0.63      0.74      0.64      1404
--- a/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.86      0.85       300
+           1       0.74      0.63      0.68       216
+           2       0.77      0.31      0.45        86
+           3       0.50      0.04      0.08        46
+           4       0.69      0.33      0.44        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.80      0.79       245
+           7       0.69      0.26      0.38        42
+           8       0.74      0.62      0.68       127
+           9       1.00      0.67      0.80        12
+          10       0.80      0.57      0.67       127
+          11       1.00      0.50      0.67        14
+          12       0.79      0.46      0.58       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.62      0.69      1404
+   macro avg       0.67      0.43      0.50      1404
+weighted avg       0.77      0.62      0.67      1404
+ samples avg       0.80      0.68      0.70      1404
--- a/games_march2025_cleaned_2k_i3k/Perceptron.txt
+++ b/games_march2025_cleaned_2k_i3k/Perceptron.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.94      0.85       300
+           1       0.60      0.88      0.71       216
+           2       0.54      0.60      0.57        86
+           3       0.33      0.04      0.08        46
+           4       0.68      0.16      0.25        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.86      0.80       245
+           7       0.63      0.29      0.39        42
+           8       0.62      0.80      0.69       127
+           9       1.00      0.67      0.80        12
+          10       0.89      0.43      0.58       127
+          11       0.70      0.50      0.58        14
+          12       0.88      0.27      0.42       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.68      0.69      1404
+   macro avg       0.60      0.46      0.48      1404
+weighted avg       0.71      0.68      0.66      1404
+ samples avg       0.72      0.74      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.80      0.88      0.84       300
+           1       0.78      0.55      0.64       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.06      0.11        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.78      0.76       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.24      0.38       127
+           9       0.00      0.00      0.00        12
+          10       0.91      0.24      0.38       127
+          11       1.00      0.14      0.25        14
+          12       1.00      0.25      0.39       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.48      0.59      1404
+   macro avg       0.58      0.23      0.27      1404
+weighted avg       0.78      0.48      0.52      1404
+ samples avg       0.77      0.54      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.88      0.86       300
+           1       0.76      0.66      0.70       216
+           2       0.80      0.14      0.24        86
+           3       0.00      0.00      0.00        46
+           4       0.85      0.20      0.33        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.82      0.80       245
+           7       0.86      0.14      0.24        42
+           8       0.79      0.54      0.64       127
+           9       1.00      0.42      0.59        12
+          10       0.88      0.50      0.64       127
+          11       1.00      0.14      0.25        14
+          12       0.83      0.38      0.52       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.81      0.59      0.68      1404
+   macro avg       0.67      0.34      0.42      1404
+weighted avg       0.79      0.59      0.63      1404
+ samples avg       0.81      0.65      0.68      1404
--- a/games_march2025_cleaned_2k_i3k/SGDClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/SGDClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.86      0.84      0.85       300
+           1       0.80      0.52      0.63       216
+           2       0.68      0.35      0.46        86
+           3       0.44      0.09      0.15        46
+           4       0.68      0.34      0.45        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.80      0.79       245
+           7       0.71      0.24      0.36        42
+           8       0.75      0.55      0.64       127
+           9       1.00      0.58      0.74        12
+          10       0.85      0.52      0.64       127
+          11       0.89      0.57      0.70        14
+          12       0.60      0.64      0.62       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.77      0.61      0.68      1404
+   macro avg       0.65      0.43      0.50      1404
+weighted avg       0.77      0.61      0.66      1404
+ samples avg       0.79      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/SVC.txt
+++ b/games_march2025_cleaned_2k_i3k/SVC.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.81      0.90      0.85       300
+           1       0.76      0.63      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.83      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.40      0.54       127
+           9       1.00      0.17      0.29        12
+          10       0.90      0.34      0.49       127
+          11       1.00      0.14      0.25        14
+          12       0.92      0.21      0.34       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.53      0.63      1404
+   macro avg       0.64      0.26      0.32      1404
+weighted avg       0.79      0.53      0.56      1404
+ samples avg       0.79      0.59      0.63      1404
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -304,13 +304,47 @@
    "``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
    "As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n",
    "\n",
-    "### The comparison\n",
-    "We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
+    "### Initial Comparison\n",
+    "We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n",
    "There were some rules as a baseline for comparison:\n",
    "- All Hyperparameters are set to default\n",
-    "- All iteration limits are set to 3000\n",
+    "- All iteration limits are set to 3000 (exception: MLPClassifier with 300, because its limit counts epochs instead of iterations)\n",
+    "- All ``random_state``s are set to 0\n",
    "\n",
-    "![Comparison Image](./compare_models_2k.png)"
+    "Running all models with that configuration yields the following weighted F1-Scores (results as seen in the ``games_march2025_cleaned_2k_i3k`` folder): \n",
+    "\n",
+    "![Comparison Image 2k](./compare_models_2k.png)\n",
+    "\n",
+    "If we also compare the Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the 2k Dataset does not contain enough datapoints for every class (two classes have no samples at all in the test data), so we should proceed to the 10k Dataset before making major choices.\n",
+    "\n",
+    "![Comparison Image 2k Micro/Macro/Weighted](./compare_models_2k_3.png)\n",
+    "\n",
+    "These are the 10 best-performing models, which will be run on the 10k Dataset with the same rules as before:\n",
+    "1. NearestCentroid\n",
+    "2. Perceptron\n",
+    "3. PassiveAggressiveClassifier\n",
+    "4. LinearSVC\n",
+    "5. SGDClassifier\n",
+    "6. HistGradientBoostingClassifier\n",
+    "7. MLPClassifier\n",
+    "8. RidgeClassifier\n",
+    "9. GradientBoostingClassifier\n",
+    "10. LinearDiscriminantAnalysis\n",
+    "\n",
+    "![Comparison Image 10k](./compare_models_10k.png)\n",
+    "\n",
+    "We can also compare these models between datasets, to see if a bigger dataset always improves the performance.\n",
+    "\n",
+    "![Comparison Image between 2k and 10k](./compare_models_2k_10k.png)\n",
+    "\n",
+    "The final contenders are:\n",
+    "1.\n",
+    "2.\n",
+    "3.\n",
+    "4.\n",
+    "5.\n",
+    "\n",
+    "..."
   ]
  },
  {