changes

2025-08-18 20:22:59 +02:00
parent 28df88c0bf
commit 530d312dfd
31 changed files with 725 additions and 20 deletions
--- a/compare_dataset_sizes.png
+++ b/compare_dataset_sizes.png
--- a/compare_dataset_sizes.py
+++ b/compare_dataset_sizes.py
@@ -22,17 +22,20 @@ for dataset_name, folder in datasets.items():
                        results[dataset_name][model_name] = f1_score
 # Plot
-models = sorted(results["cleaned"].keys())  # alphabetisch sortieren für gleiche Reihenfolge
+#models = sorted(results["cleaned_2k"].keys())  # alphabetisch sortieren für gleiche Reihenfolge
 models = dict(sorted(results["cleaned_2k"].items(), key=lambda i: i[1], reverse=True)) # nach values sortieren
 x = range(len(models))
 plt.figure(figsize=(12,6))
-plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
+#plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
-plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleaned_2k")
+plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k")
-plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
+#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
-plt.xticks(x, models, rotation=45)
+plt.xticks(x, models, rotation=90)
-plt.ylabel("F1-Score")
+plt.ylim(0, 1) # min max
 plt.ylabel("Weighted F1-Score")
 plt.title("Model Performance across Datasets")
-plt.legend()
+#plt.legend()
 plt.tight_layout()
 plt.savefig('compare_graph_latest.png')
 plt.show()
--- a/compare_graph_maker_3.py
+++ b/compare_graph_maker_3.py
@@ -0,0 +1,59 @@
 import os
 import matplotlib.pyplot as plt
 import numpy as np
 # Grouped bar chart of micro / macro / weighted F1 per model, read from the
 # classification-report .txt files stored in each dataset folder.
 # Short dataset label -> folder holding one report file per model.
 datasets = {
    #"cleaned": "games_march2025_cleaned",
    #"cleaned_2k": "games_march2025_cleaned_2k",
    #"cleaned_10k": "games_march2025_cleaned_10k"
    "cleaned_2k": "games_march2025_cleaned_2k_i3k",
 }
 # Dataset label whose results are plotted.
 plot_key = "cleaned_2k"
 # (row prefix in the report, slot in the per-model dict, progress tag printed)
 # Slots: 0 = micro F1, 1 = macro F1, 2 = weighted F1.
 avg_rows = (
    ("micro avg", 0, "micro"),
    ("macro avg", 1, "macro"),
    ("weighted avg", 2, "weight"),
 )
 # results[dataset_name][model_name][slot] -> F1 value
 results = {}
 for dataset_name, folder in datasets.items():
    results[dataset_name] = {}
    for filename in os.listdir(folder):
        if not filename.endswith(".txt"):
            continue
        model_name = filename.replace(".txt", "")
        print("model " + model_name)
        scores = {}
        # The reports are written as UTF-8, so read them the same way instead
        # of relying on the platform default encoding.
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            for line in f:
                stripped = line.strip()
                for prefix, slot, tag in avg_rows:
                    if stripped.startswith(prefix):
                        print(tag)
                        # Tokens: <name> avg precision recall f1 support
                        scores[slot] = float(line.split()[4])
        if len(scores) < len(avg_rows):
            # A file without all three average rows would raise KeyError in
            # the sort/plot code below, so leave it out of the chart.
            print("skipping " + model_name + ": incomplete report")
            continue
        results[dataset_name][model_name] = scores
 # Plot
 # Order the models by weighted F1 (slot 2), best first.
 models = dict(sorted(results[plot_key].items(), key=lambda i: i[1][2], reverse=True))
 print(models)
 names = list(models)
 x = range(len(names))
 plt.figure()
 # Three bars per model, each 0.25 wide, centred on the model's tick.
 plt.bar([i - 0.25 for i in x], [models[m][0] for m in names], width=0.25, label="Micro")
 plt.bar(x,                     [models[m][1] for m in names], width=0.25, label="Macro")
 plt.bar([i + 0.25 for i in x], [models[m][2] for m in names], width=0.25, label="Weighted")
 plt.xticks(x, names, rotation=90)
 plt.ylabel("F1 Score")
 plt.ylim(0,1)
 plt.title("Model Performance - 2k Dataset")
 plt.legend()
 plt.tight_layout()
 # Save before showing: the figure is written to disk first, then displayed once.
 plt.savefig('compare_graph_latest_3.png')
 plt.show()
--- a/compare_models_10k.py
+++ b/compare_models_10k.py
@@ -0,0 +1,126 @@
 import os
 import numpy as np
 import pandas as pd
 from sklearn import set_config
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
 import ast
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.multioutput import MultiOutputClassifier
 from sklearn.metrics import classification_report
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_iris
 from sklearn.metrics import accuracy_score, classification_report
 from sklearn.svm import SVC, LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier
 from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
 from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.dummy import DummyClassifier
 from sklearn.neural_network import MLPClassifier
 set_config(transform_output="pandas") # dataframe supremacy
 jobs = 12
 max_iter = 3000
 def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
    """Load a games CSV and turn it into TF-IDF features and multi-hot genre targets.

    Parameters
    ----------
    dataset : path of the CSV file. It must contain the columns
        'detailed_description', 'about_the_game', 'short_description' and
        'genres'; each 'genres' cell is parsed with ast.literal_eval
        (presumably a stringified list of genre names - verify against the CSV).

    Returns
    -------
    The result of train_test_split(X, y, random_state=0), i.e.
    X_train, X_test, y_train, y_test, where X is the dense TF-IDF frame and
    y the multi-hot genre frame, both restricted to rows with at least one genre.

    NOTE(review): the TF-IDF vocabulary is fitted on all rows before the
    train/test split, so test documents influence the features.
    """
    frame = pd.read_csv(dataset, sep=",")
    # desc, genres, tags
    column_transformer = ColumnTransformer([
            # merge all descriptions into a single 'desc' text column
            ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
                ['detailed_description', 'about_the_game', 'short_description']),
            ('pass', 'passthrough', ['genres']),#, 'tags'
        ],
        verbose_feature_names_out=False
    )
    frame = column_transformer.fit_transform(frame)
    #### SET MISSING VALUES
    print("SETMISS")
    # Missing numeric values become the column mean
    frame.fillna(frame.mean(numeric_only=True), inplace=True)
    # Remaining missing values become the empty string
    frame.fillna('', inplace=True)
    # Drop any row that still contains a missing value
    frame.dropna(inplace=True)
    ##### STRUCTURIZE GENRES to onehot
    # Parse the stringified genre lists. A missing cell is '' after the fill
    # above, and ast.literal_eval('') raises SyntaxError, so treat it as "no
    # genres"; such rows are removed by the all-zero mask further down.
    frame['genres'] = frame['genres'].map(lambda s: ast.literal_eval(s) if s else [])
    # MultiLabelBinarizer builds one 0/1 column per distinct genre
    mlb_genres = MultiLabelBinarizer()
    genres_encoded = mlb_genres.fit_transform(frame.pop('genres'))
    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
    #### convert text to bag of words
    ## Count vs Tfidf vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc']) # sparse matrix
    # Densified on purpose: X is handed to the estimators as a DataFrame.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    ##### MODEL
    print("MODEL")
    X = tfidf_df
    y = genres_df
    # Both frames carry a fresh positional index, so one boolean mask selects
    # the same rows in each. Keep only rows that have at least one genre set.
    mask = y.sum(axis=1) > 0
    X_clean = X[mask]
    y_clean = y[mask]
    # Split dataset
    return train_test_split(X_clean, y_clean, random_state=0)
 def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report
    """Train `estimator` once per target column and report its test-set scores.

    The estimator is wrapped in a MultiOutputClassifier that uses the
    module-level `jobs` value as n_jobs, fitted on the training split and
    evaluated on the test split. Returns the text produced by
    classification_report, with undefined metrics reported as 0.0.
    """
    # e.g. estimator = LogisticRegression(max_iter=1337, random_state=0)
    wrapped = MultiOutputClassifier(estimator, n_jobs=jobs)
    # fit() hands back the fitted wrapper, so training and prediction chain
    predictions = wrapped.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, predictions, zero_division=0.0)
    return report
 # CSV files to benchmark. The driver loop further down derives the output
 # folder from each file name; only the 10k file is enabled in this script.
 datasets = [
    #'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    #'games_march2025_cleaned.csv'
 ]
 # Report name -> base estimator. The key becomes the report file name
 # ("<key>.txt"); entries run in the order listed. random_state=0 is set
 # wherever it is passed so repeated runs use the same seed.
 estimators = {
    "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
    "Perceptron": Perceptron(random_state=0, max_iter=max_iter),
    "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
    "NearestCentroid": NearestCentroid(),
    "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    # The MLP gets a twentieth of the shared iteration budget and stops early.
    "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
 }
 # Ensemble candidates that are currently disabled:
 #"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
 #"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
 # Benchmark driver: for every dataset, prepare the split once, then train
 # each estimator and write its classification report to
 # "<dataset name without .csv>/<estimator name>.txt".
 for dataset in datasets:
    print("-" * 60)
    print("dataset -> " + dataset)
    print("mkdir")
    folder = dataset.split(".csv")[0]
    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs(folder, exist_ok=True)
    X_train, X_test, y_train, y_test = prepDataset(dataset)
    for esti, estimator in estimators.items():
        print("model: " + esti)
        compari = comparison(X_train, X_test, y_train, y_test, estimator)
        print("open")
        # The context manager closes the file even if the write fails
        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
            f.write(compari)
            print("write")
        print("close")
--- a/compare_models_2k.png
+++ b/compare_models_2k.png
--- a/generate_compare_dataset.py
+++ b/generate_compare_dataset.py
@@ -27,6 +27,10 @@ from sklearn.dummy import DummyClassifier
 from sklearn.neural_network import MLPClassifier
 set_config(transform_output="pandas") # dataframe supremacy
 jobs = 12
 max_iter = 3000
 def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
    dataset = pd.read_csv(dataset,sep=",")
    # desc, genres, tags
@@ -76,7 +80,7 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
    y_clean = y[mask]
    # Split dataset
    return train_test_split(X_clean, y_clean, random_state=0)
-def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report
+def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report
    multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0)
    # model training
    multi_target_clf.fit(X_train, y_train)
@@ -88,9 +92,6 @@ datasets = [
    #'games_march2025_cleaned_10k.csv',
    #'games_march2025_cleaned.csv'
 ]
 max_iter = 3000  # <-- set your desired value here
 estimators = {
    "LogisticRegression": LogisticRegression(random_state=0, max_iter=max_iter),
    "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
@@ -99,8 +100,8 @@ estimators = {
    "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "NearestCentroid": NearestCentroid(),
-    "RadiusNeighborsClassifier": RadiusNeighborsClassifier(),
+    # "RadiusNeighborsClassifier": RadiusNeighborsClassifier(), # failed bcs no neighbours in range :sob:
-    "LinearSVC-i5000": LinearSVC(random_state=0, max_iter=max_iter),
+    "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
    "SVC": SVC(random_state=0, max_iter=max_iter),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
@@ -114,8 +115,7 @@ estimators = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
-    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
+    "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/5), verbose=True),
    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
    "DummyClassifier": DummyClassifier(random_state=0)
 }
@@ -131,7 +131,7 @@ for dataset in datasets:
    X_train, X_test, y_train, y_test = prepDataset(dataset)
    for esti in estimators:
        print("model: " + esti)
-        compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can
+        compari = comparison(X_train, X_test, y_train, y_test, estimators[esti])
        print("open")
        f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8")
        f.write(compari)
--- a/compare_models_2k_3.png
+++ b/compare_models_2k_3.png
--- a/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.87      0.76      0.81       300
           1       0.70      0.59      0.64       216
           2       0.58      0.13      0.21        86
           3       0.56      0.11      0.18        46
           4       0.71      0.30      0.42        83
           5       0.00      0.00      0.00         0
           6       0.69      0.70      0.69       245
           7       0.62      0.31      0.41        42
           8       0.76      0.41      0.53       127
           9       1.00      0.50      0.67        12
          10       0.67      0.50      0.57       127
          11       0.40      0.29      0.33        14
          12       0.74      0.45      0.56       106
          13       0.00      0.00      0.00         0
   micro avg       0.74      0.54      0.62      1404
   macro avg       0.59      0.36      0.43      1404
 weighted avg       0.73      0.54      0.60      1404
 samples avg       0.74      0.59      0.61      1404
--- a/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.86      0.70      0.77       300
           1       0.72      0.50      0.59       216
           2       0.47      0.09      0.16        86
           3       0.50      0.04      0.08        46
           4       0.58      0.23      0.33        83
           5       0.00      0.00      0.00         0
           6       0.71      0.64      0.67       245
           7       0.80      0.29      0.42        42
           8       0.79      0.46      0.58       127
           9       1.00      0.25      0.40        12
          10       0.71      0.43      0.53       127
          11       0.40      0.29      0.33        14
          12       0.68      0.42      0.52       106
          13       0.00      0.00      0.00         0
   micro avg       0.74      0.49      0.59      1404
   macro avg       0.59      0.31      0.39      1404
 weighted avg       0.72      0.49      0.56      1404
 samples avg       0.70      0.54      0.57      1404
--- a/games_march2025_cleaned_2k_i3k/BernoulliNB.txt
+++ b/games_march2025_cleaned_2k_i3k/BernoulliNB.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.75      0.90      0.82       300
           1       0.72      0.68      0.70       216
           2       0.50      0.08      0.14        86
           3       0.27      0.07      0.11        46
           4       0.40      0.07      0.12        83
           5       0.00      0.00      0.00         0
           6       0.77      0.82      0.79       245
           7       0.33      0.10      0.15        42
           8       0.67      0.40      0.50       127
           9       0.00      0.00      0.00        12
          10       0.71      0.37      0.49       127
          11       0.00      0.00      0.00        14
          12       0.49      0.31      0.38       106
          13       0.00      0.00      0.00         0
   micro avg       0.70      0.55      0.62      1404
   macro avg       0.40      0.27      0.30      1404
 weighted avg       0.64      0.55      0.56      1404
 samples avg       0.73      0.59      0.61      1404
--- a/games_march2025_cleaned_2k_i3k/ComplementNB.txt
+++ b/games_march2025_cleaned_2k_i3k/ComplementNB.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.67      0.98      0.80       300
           1       0.81      0.36      0.50       216
           2       0.67      0.05      0.09        86
           3       0.00      0.00      0.00        46
           4       0.80      0.05      0.09        83
           5       0.00      0.00      0.00         0
           6       0.77      0.81      0.79       245
           7       0.40      0.05      0.09        42
           8       0.83      0.04      0.08       127
           9       0.00      0.00      0.00        12
          10       0.43      0.02      0.04       127
          11       0.00      0.00      0.00        14
          12       1.00      0.05      0.09       106
          13       0.00      0.00      0.00         0
   micro avg       0.70      0.42      0.53      1404
   macro avg       0.46      0.17      0.18      1404
 weighted avg       0.69      0.42      0.42      1404
 samples avg       0.71      0.46      0.52      1404
--- a/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.76      0.73      0.75       300
           1       0.56      0.53      0.54       216
           2       0.36      0.33      0.34        86
           3       0.33      0.26      0.29        46
           4       0.40      0.46      0.43        83
           5       0.00      0.00      0.00         0
           6       0.65      0.61      0.63       245
           7       0.39      0.40      0.40        42
           8       0.59      0.57      0.58       127
           9       0.60      0.25      0.35        12
          10       0.56      0.51      0.53       127
          11       0.39      0.50      0.44        14
          12       0.52      0.49      0.50       106
          13       0.00      0.00      0.00         0
   micro avg       0.58      0.55      0.57      1404
   macro avg       0.44      0.40      0.41      1404
 weighted avg       0.58      0.55      0.57      1404
 samples avg       0.59      0.59      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/DummyClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/DummyClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.60      1.00      0.75       300
           1       0.00      0.00      0.00       216
           2       0.00      0.00      0.00        86
           3       0.00      0.00      0.00        46
           4       0.00      0.00      0.00        83
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00       245
           7       0.00      0.00      0.00        42
           8       0.00      0.00      0.00       127
           9       0.00      0.00      0.00        12
          10       0.00      0.00      0.00       127
          11       0.00      0.00      0.00        14
          12       0.00      0.00      0.00       106
          13       0.00      0.00      0.00         0
   micro avg       0.60      0.21      0.32      1404
   macro avg       0.04      0.07      0.05      1404
 weighted avg       0.13      0.21      0.16      1404
 samples avg       0.60      0.26      0.34      1404
--- a/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.81      0.91      0.86       300
           1       0.78      0.62      0.69       216
           2       1.00      0.03      0.07        86
           3       0.00      0.00      0.00        46
           4       1.00      0.04      0.07        83
           5       0.00      0.00      0.00         0
           6       0.78      0.73      0.75       245
           7       0.00      0.00      0.00        42
           8       0.84      0.24      0.38       127
           9       1.00      0.17      0.29        12
          10       0.90      0.21      0.34       127
          11       1.00      0.14      0.25        14
          12       0.83      0.18      0.29       106
          13       0.00      0.00      0.00         0
   micro avg       0.80      0.48      0.60      1404
   macro avg       0.64      0.23      0.29      1404
 weighted avg       0.79      0.48      0.52      1404
 samples avg       0.78      0.54      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/GaussianNB.txt
+++ b/games_march2025_cleaned_2k_i3k/GaussianNB.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.76      0.80      0.78       300
           1       0.62      0.51      0.56       216
           2       0.63      0.14      0.23        86
           3       0.17      0.02      0.04        46
           4       0.42      0.10      0.16        83
           5       0.00      0.00      0.00         0
           6       0.68      0.66      0.67       245
           7       0.56      0.12      0.20        42
           8       0.55      0.33      0.41       127
           9       0.67      0.17      0.27        12
          10       0.65      0.31      0.42       127
          11       1.00      0.14      0.25        14
          12       0.53      0.29      0.38       106
          13       0.00      0.00      0.00         0
   micro avg       0.66      0.47      0.55      1404
   macro avg       0.52      0.26      0.31      1404
 weighted avg       0.62      0.47      0.51      1404
 samples avg       0.67      0.53      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.87      0.80      0.83       300
           1       0.77      0.61      0.68       216
           2       0.55      0.13      0.21        86
           3       0.42      0.11      0.17        46
           4       0.68      0.33      0.44        83
           5       0.00      0.00      0.00         0
           6       0.71      0.76      0.74       245
           7       0.61      0.26      0.37        42
           8       0.81      0.50      0.61       127
           9       0.75      0.25      0.38        12
          10       0.81      0.54      0.65       127
          11       0.40      0.43      0.41        14
          12       0.69      0.42      0.53       106
          13       0.00      0.00      0.00         0
   micro avg       0.76      0.57      0.65      1404
   macro avg       0.58      0.37      0.43      1404
 weighted avg       0.74      0.57      0.63      1404
 samples avg       0.77      0.63      0.65      1404
--- a/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.83      0.83      0.83       300
           1       0.74      0.69      0.72       216
           2       0.80      0.28      0.41        86
           3       1.00      0.04      0.08        46
           4       0.70      0.39      0.50        83
           5       0.00      0.00      0.00         0
           6       0.72      0.76      0.74       245
           7       0.73      0.19      0.30        42
           8       0.85      0.59      0.70       127
           9       1.00      0.33      0.50        12
          10       0.78      0.54      0.64       127
          11       0.43      0.21      0.29        14
          12       0.77      0.52      0.62       106
          13       0.00      0.00      0.00         0
   micro avg       0.78      0.61      0.68      1404
   macro avg       0.67      0.38      0.45      1404
 weighted avg       0.78      0.61      0.66      1404
 samples avg       0.79      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.82      0.62      0.70       300
           1       0.69      0.46      0.55       216
           2       0.62      0.06      0.11        86
           3       0.20      0.02      0.04        46
           4       0.72      0.16      0.26        83
           5       0.00      0.00      0.00         0
           6       0.78      0.55      0.64       245
           7       0.38      0.12      0.18        42
           8       0.59      0.65      0.62       127
           9       1.00      0.67      0.80        12
          10       0.68      0.44      0.54       127
          11       1.00      0.29      0.44        14
          12       0.34      0.76      0.48       106
          13       0.00      0.00      0.00         0
   micro avg       0.64      0.48      0.55      1404
   macro avg       0.56      0.34      0.38      1404
 weighted avg       0.68      0.48      0.53      1404
 samples avg       0.64      0.54      0.55      1404
--- a/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt
+++ b/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.63      0.68      0.66       300
           1       0.47      0.56      0.51       216
           2       0.27      0.59      0.37        86
           3       0.06      0.28      0.10        46
           4       0.21      0.52      0.30        83
           5       0.00      0.00      0.00         0
           6       0.63      0.67      0.65       245
           7       0.06      0.29      0.10        42
           8       0.28      0.52      0.36       127
           9       0.03      0.42      0.06        12
          10       0.29      0.52      0.38       127
          11       0.04      0.43      0.07        14
          12       0.53      0.44      0.48       106
          13       0.00      0.00      0.00         0
   micro avg       0.30      0.57      0.39      1404
   macro avg       0.25      0.42      0.29      1404
 weighted avg       0.44      0.57      0.48      1404
 samples avg       0.42      0.62      0.40      1404
--- a/games_march2025_cleaned_2k_i3k/LinearSVC.txt
+++ b/games_march2025_cleaned_2k_i3k/LinearSVC.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.85      0.87      0.86       300
           1       0.76      0.66      0.70       216
           2       0.77      0.20      0.31        86
           3       0.00      0.00      0.00        46
           4       0.76      0.27      0.39        83
           5       0.00      0.00      0.00         0
           6       0.78      0.81      0.79       245
           7       0.89      0.19      0.31        42
           8       0.77      0.60      0.67       127
           9       1.00      0.58      0.74        12
          10       0.85      0.54      0.66       127
          11       1.00      0.29      0.44        14
          12       0.82      0.42      0.56       106
          13       0.00      0.00      0.00         0
   micro avg       0.80      0.61      0.69      1404
   macro avg       0.66      0.39      0.46      1404
 weighted avg       0.78      0.61      0.66      1404
 samples avg       0.81      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/LogisticRegression.txt
+++ b/games_march2025_cleaned_2k_i3k/LogisticRegression.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.78      0.91      0.84       300
           1       0.78      0.62      0.69       216
           2       1.00      0.03      0.07        86
           3       0.00      0.00      0.00        46
           4       1.00      0.04      0.07        83
           5       0.00      0.00      0.00         0
           6       0.79      0.81      0.80       245
           7       0.00      0.00      0.00        42
           8       0.90      0.34      0.49       127
           9       0.00      0.00      0.00        12
          10       0.89      0.25      0.39       127
          11       0.00      0.00      0.00        14
          12       0.88      0.14      0.24       106
          13       0.00      0.00      0.00         0
   micro avg       0.79      0.50      0.61      1404
   macro avg       0.50      0.22      0.26      1404
 weighted avg       0.77      0.50      0.53      1404
 samples avg       0.77      0.56      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/MLPClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/MLPClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.84      0.85      0.84       300
           1       0.73      0.67      0.70       216
           2       0.74      0.30      0.43        86
           3       0.50      0.02      0.04        46
           4       0.69      0.24      0.36        83
           5       0.00      0.00      0.00         0
           6       0.79      0.79      0.79       245
           7       0.86      0.14      0.24        42
           8       0.76      0.63      0.69       127
           9       1.00      0.33      0.50        12
          10       0.81      0.52      0.63       127
          11       1.00      0.14      0.25        14
          12       0.75      0.41      0.53       106
          13       0.00      0.00      0.00         0
   micro avg       0.79      0.60      0.68      1404
   macro avg       0.68      0.36      0.43      1404
 weighted avg       0.78      0.60      0.65      1404
 samples avg       0.80      0.66      0.68      1404
--- a/games_march2025_cleaned_2k_i3k/MultinomialNB.txt
+++ b/games_march2025_cleaned_2k_i3k/MultinomialNB.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.64      0.99      0.78       300
           1       0.85      0.24      0.37       216
           2       0.60      0.03      0.07        86
           3       0.00      0.00      0.00        46
           4       0.80      0.05      0.09        83
           5       0.00      0.00      0.00         0
           6       0.78      0.80      0.79       245
           7       0.40      0.05      0.09        42
           8       1.00      0.04      0.08       127
           9       0.00      0.00      0.00        12
          10       0.20      0.01      0.02       127
          11       0.00      0.00      0.00        14
          12       1.00      0.05      0.09       106
          13       0.00      0.00      0.00         0
   micro avg       0.69      0.40      0.51      1404
   macro avg       0.45      0.16      0.17      1404
 weighted avg       0.68      0.40      0.39      1404
 samples avg       0.70      0.44      0.50      1404
--- a/games_march2025_cleaned_2k_i3k/NearestCentroid.txt
+++ b/games_march2025_cleaned_2k_i3k/NearestCentroid.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.83      0.75      0.79       300
           1       0.65      0.75      0.70       216
           2       0.43      0.72      0.54        86
           3       0.18      0.33      0.23        46
           4       0.46      0.61      0.53        83
           5       0.00      0.00      0.00         0
           6       0.74      0.76      0.75       245
           7       0.31      0.62      0.41        42
           8       0.47      0.69      0.55       127
           9       1.00      0.67      0.80        12
          10       0.59      0.69      0.64       127
          11       0.60      0.64      0.62        14
          12       0.42      0.66      0.52       106
          13       0.00      0.00      0.00         0
   micro avg       0.57      0.70      0.63      1404
   macro avg       0.48      0.56      0.50      1404
 weighted avg       0.62      0.70      0.65      1404
 samples avg       0.63      0.74      0.64      1404
--- a/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.84      0.86      0.85       300
           1       0.74      0.63      0.68       216
           2       0.77      0.31      0.45        86
           3       0.50      0.04      0.08        46
           4       0.69      0.33      0.44        83
           5       0.00      0.00      0.00         0
           6       0.79      0.80      0.79       245
           7       0.69      0.26      0.38        42
           8       0.74      0.62      0.68       127
           9       1.00      0.67      0.80        12
          10       0.80      0.57      0.67       127
          11       1.00      0.50      0.67        14
          12       0.79      0.46      0.58       106
          13       0.00      0.00      0.00         0
   micro avg       0.79      0.62      0.69      1404
   macro avg       0.67      0.43      0.50      1404
 weighted avg       0.77      0.62      0.67      1404
 samples avg       0.80      0.68      0.70      1404
--- a/games_march2025_cleaned_2k_i3k/Perceptron.txt
+++ b/games_march2025_cleaned_2k_i3k/Perceptron.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.78      0.94      0.85       300
           1       0.60      0.88      0.71       216
           2       0.54      0.60      0.57        86
           3       0.33      0.04      0.08        46
           4       0.68      0.16      0.25        83
           5       0.00      0.00      0.00         0
           6       0.74      0.86      0.80       245
           7       0.63      0.29      0.39        42
           8       0.62      0.80      0.69       127
           9       1.00      0.67      0.80        12
          10       0.89      0.43      0.58       127
          11       0.70      0.50      0.58        14
          12       0.88      0.27      0.42       106
          13       0.00      0.00      0.00         0
   micro avg       0.70      0.68      0.69      1404
   macro avg       0.60      0.46      0.48      1404
 weighted avg       0.71      0.68      0.66      1404
 samples avg       0.72      0.74      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.80      0.88      0.84       300
           1       0.78      0.55      0.64       216
           2       1.00      0.03      0.07        86
           3       0.00      0.00      0.00        46
           4       1.00      0.06      0.11        83
           5       0.00      0.00      0.00         0
           6       0.74      0.78      0.76       245
           7       0.00      0.00      0.00        42
           8       0.84      0.24      0.38       127
           9       0.00      0.00      0.00        12
          10       0.91      0.24      0.38       127
          11       1.00      0.14      0.25        14
          12       1.00      0.25      0.39       106
          13       0.00      0.00      0.00         0
   micro avg       0.79      0.48      0.59      1404
   macro avg       0.58      0.23      0.27      1404
 weighted avg       0.78      0.48      0.52      1404
 samples avg       0.77      0.54      0.60      1404
--- a/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.84      0.88      0.86       300
           1       0.76      0.66      0.70       216
           2       0.80      0.14      0.24        86
           3       0.00      0.00      0.00        46
           4       0.85      0.20      0.33        83
           5       0.00      0.00      0.00         0
           6       0.78      0.82      0.80       245
           7       0.86      0.14      0.24        42
           8       0.79      0.54      0.64       127
           9       1.00      0.42      0.59        12
          10       0.88      0.50      0.64       127
          11       1.00      0.14      0.25        14
          12       0.83      0.38      0.52       106
          13       0.00      0.00      0.00         0
   micro avg       0.81      0.59      0.68      1404
   macro avg       0.67      0.34      0.42      1404
 weighted avg       0.79      0.59      0.63      1404
 samples avg       0.81      0.65      0.68      1404
--- a/games_march2025_cleaned_2k_i3k/SGDClassifier.txt
+++ b/games_march2025_cleaned_2k_i3k/SGDClassifier.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.86      0.84      0.85       300
           1       0.80      0.52      0.63       216
           2       0.68      0.35      0.46        86
           3       0.44      0.09      0.15        46
           4       0.68      0.34      0.45        83
           5       0.00      0.00      0.00         0
           6       0.77      0.80      0.79       245
           7       0.71      0.24      0.36        42
           8       0.75      0.55      0.64       127
           9       1.00      0.58      0.74        12
          10       0.85      0.52      0.64       127
          11       0.89      0.57      0.70        14
          12       0.60      0.64      0.62       106
          13       0.00      0.00      0.00         0
   micro avg       0.77      0.61      0.68      1404
   macro avg       0.65      0.43      0.50      1404
 weighted avg       0.77      0.61      0.66      1404
 samples avg       0.79      0.67      0.69      1404
--- a/games_march2025_cleaned_2k_i3k/SVC.txt
+++ b/games_march2025_cleaned_2k_i3k/SVC.txt
@@ -0,0 +1,21 @@
              precision    recall  f1-score   support
           0       0.81      0.90      0.85       300
           1       0.76      0.63      0.69       216
           2       1.00      0.03      0.07        86
           3       0.00      0.00      0.00        46
           4       1.00      0.05      0.09        83
           5       0.00      0.00      0.00         0
           6       0.77      0.83      0.80       245
           7       0.00      0.00      0.00        42
           8       0.84      0.40      0.54       127
           9       1.00      0.17      0.29        12
          10       0.90      0.34      0.49       127
          11       1.00      0.14      0.25        14
          12       0.92      0.21      0.34       106
          13       0.00      0.00      0.00         0
   micro avg       0.80      0.53      0.63      1404
   macro avg       0.64      0.26      0.32      1404
 weighted avg       0.79      0.53      0.56      1404
 samples avg       0.79      0.59      0.63      1404
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -304,13 +304,47 @@
    "``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n",
    "As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n",
    "\n",
-    "### The comparison\n",
+    "### Initial Comparison\n",
-    "We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n",
+    "We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n",
    "There were some rules as a baseline for comparison:\n",
    "- All Hyperparameters are set to default\n",
-    "- All iteration limits are set to 3000\n",
+    "- All iteration limits are set to 3000 (exception: MLPClassifier with 300, where i-limit are epochs instead of iterations )\n",
    "- All ``random_state``s are set to 0\n",
    "\n",
-    "![Comparison Image](./compare_models_2k.png)"
+    "Running all models with that configuration yields the following weighted F1-Scores (results as seen in the ``games_march2025_cleaned_2k_i3k`` folder): \n",
    "\n",
    "![Comparison Image 2k](./compare_models_2k.png)\n",
    "\n",
    "If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. That is because the 2k Dataset does not contain enough datapoints for every class (test data for 2 classes is 0), so we should proceed to the 10k Dataset before making major choices.\n",
    "\n",
    "![Comparison Image 2k Micro/Macro/Weighted](./compare_models_2k_3.png)\n",
    "\n",
    "The 10 best performing models which will run on the 10k Dataset with the same rules as before:\n",
    "1. NearestCentroid\n",
    "2. Perceptron\n",
    "3. PassiveAggressiveClassifier\n",
    "4. LinearSVC\n",
    "5. SDGClassifer\n",
    "6. HistGradientBoostingClassifier\n",
    "7. MLPClassifier\n",
    "8. RidgeClassifier\n",
    "9. GradientBoostingClassifier\n",
    "10. LinearDiscriminationAnalysis\n",
    "\n",
    "![Comparison Image 10k](./compare_models_10k.png)\n",
    "\n",
    "We can also compare these models between datasets, to see if a bigger dataset always improves the performance.\n",
    "\n",
    "![Comparison Image between 2k and 10k](./compare_models_2k_10k.png)\n",
    "\n",
    "The final contenders are:\n",
    "1.\n",
    "2.\n",
    "3.\n",
    "4.\n",
    "5.\n",
    "\n",
    "..."
   ]
  },
  {