diff --git a/compare_dataset_sizes.png b/compare_dataset_sizes.png deleted file mode 100644 index 0f8cbbb..0000000 Binary files a/compare_dataset_sizes.png and /dev/null differ diff --git a/compare_dataset_sizes.py b/compare_graph_maker.py similarity index 58% rename from compare_dataset_sizes.py rename to compare_graph_maker.py index 251d29b..7d6c02d 100644 --- a/compare_dataset_sizes.py +++ b/compare_graph_maker.py @@ -22,17 +22,20 @@ for dataset_name, folder in datasets.items(): results[dataset_name][model_name] = f1_score # Plot -models = sorted(results["cleaned"].keys()) # alphabetisch sortieren für gleiche Reihenfolge +#models = sorted(results["cleaned_2k"].keys()) # alphabetisch sortieren für gleiche Reihenfolge +models = dict(sorted(results["cleaned_2k"].items(), key=lambda i: i[1], reverse=True)) # nach values sortieren x = range(len(models)) plt.figure(figsize=(12,6)) -plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned") -plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="cleaned_2k") -plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k") +#plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned") +plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k") +#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k") -plt.xticks(x, models, rotation=45) -plt.ylabel("F1-Score") +plt.xticks(x, models, rotation=90) +plt.ylim(0, 1) # min max +plt.ylabel("Weighted F1-Score") plt.title("Model Performance across Datasets") -plt.legend() +#plt.legend() plt.tight_layout() +plt.savefig('compare_graph_latest.png') plt.show() diff --git a/compare_graph_maker_3.py b/compare_graph_maker_3.py new file mode 100644 index 0000000..fa31e9e --- /dev/null +++ b/compare_graph_maker_3.py @@ -0,0 +1,59 @@ +import os +import 
matplotlib.pyplot as plt +import numpy as np + +datasets = { + #"cleaned": "games_march2025_cleaned", + #"cleaned_2k": "games_march2025_cleaned_2k", + #"cleaned_10k": "games_march2025_cleaned_10k" + "cleaned_2k": "games_march2025_cleaned_2k_i3k", +} +# def results +results = {} + +for dataset_name, folder in datasets.items(): + results[dataset_name] = {} + for filename in os.listdir(folder): + if filename.endswith(".txt"): + model_name = filename.replace(".txt", "") + print("model " + model_name) + results[dataset_name][model_name] = {} + with open(os.path.join(folder, filename), "r") as f: + for line in f: + if line.strip().startswith("micro avg"): + print("micro") + results[dataset_name][model_name][0] = float(line.split()[4]) # micro f1 + if line.strip().startswith("macro avg"): + print("macro") + results[dataset_name][model_name][1] = float(line.split()[4]) # macro f1 + if line.strip().startswith("weighted avg"): + print("weight") + results[dataset_name][model_name][2] = float(line.split()[4]) # weighted avg f1 + +# Plot +#models = sorted(results["cleaned_2k"].keys()) # alphabetisch sortieren für gleiche Reihenfolge +models = dict(sorted(results["cleaned_2k"].items(), key=lambda i: i[1][2], reverse=True)) # nach values sortieren +print(models) +x = range(len(models)) + +fig = plt.figure() +#ax = fig.add_subplot(projection='3d') + +plt.bar([i - 0.25 for i in x], [results["cleaned_2k"][m][0] for m in models], width=0.25, label="Micro") +plt.bar(x, [results["cleaned_2k"][m][1] for m in models], width=0.25, label="Macro") +plt.bar([i + 0.25 for i in x], [results["cleaned_2k"][m][2] for m in models], width=0.25, label="Weighted") + +plt.xticks(x, models, rotation=90) +plt.ylabel("F1 Score") +#ax.set_zlabel("F1 Value") +plt.ylim(0,1) +plt.title("Model Performance - 2k Dataset") +plt.legend() +plt.tight_layout() +plt.savefig('compare_graph_latest_3.png') +plt.show() + +# On the y-axis let's only label the discrete values that we have data for. 
+#ax.set_yticks(yticks) + +plt.show() \ No newline at end of file diff --git a/compare_models_10k.py b/compare_models_10k.py new file mode 100644 index 0000000..dbd3121 --- /dev/null +++ b/compare_models_10k.py @@ -0,0 +1,126 @@ +import os +import numpy as np +import pandas as pd +from sklearn import set_config + +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer +import ast +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.multioutput import MultiOutputClassifier +from sklearn.metrics import classification_report +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_iris +from sklearn.metrics import accuracy_score, classification_report +from sklearn.svm import SVC, LinearSVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier +from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier +from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier +from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.dummy import DummyClassifier +from sklearn.neural_network import MLPClassifier + +set_config(transform_output="pandas") # dataframe supremacy + +jobs = 12 +max_iter = 3000 + +def prepDataset(dataset): #returns X_train, X_test, y_train, y_test + dataset = pd.read_csv(dataset,sep=",") + 
# desc, genres, tags + column_transformer = ColumnTransformer([ + # merge all descriptions + ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")), + ['detailed_description', 'about_the_game', 'short_description']), + ('pass', 'passthrough', ['genres']),#, 'tags' + ], + verbose_feature_names_out=False + ) + dataset = column_transformer.fit_transform(dataset) + #### SET MISSING VALUES + print("SETMISS") + # Setting missing numeric values to the mean + dataset.fillna(dataset.mean(numeric_only=True), inplace=True) + # Setting missing text values to 'Unknown' + dataset.fillna('', inplace=True) + # Setting missing values in other columns to NaN + dataset.dropna(inplace=True) + ##### STRUCTURIZE GENRES to onehot + #serialize array + dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) + #print(dataset['genres']) # in py but not yet onehotenc + # MultiLabelBinarizer does onehotenc for arrays + mlb_genres = MultiLabelBinarizer() + genres_encoded = mlb_genres.fit_transform(dataset.pop('genres')) + #genres_count = len(mlb_genres.classes_) # for multi-label classifiction later + genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_) + #print(genres_df) + #dataset = pd.concat([dataset, genres_df], axis=1) + #print(dataset) + #### convert text to bag of words + ## Count vs Tfidf vectorizer + vectorizer = TfidfVectorizer() + tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix + tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) + #print(tfidf_df) + ##### MODEL + print("MODEL") + X = tfidf_df + y = genres_df + # cleanup datapoints that dont have a target value (all target columns are 0) + mask = y.sum(axis=1).map(lambda x: x > 0) + #print((mask == False).sum()) #31 cases with all target columns 0 + X_clean = X[mask] + y_clean = y[mask] + # Split dataset + return train_test_split(X_clean, y_clean, random_state=0) +def comparison(X_train, X_test, 
y_train, y_test, estimator,): #returns class_report + multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0) + # model training + multi_target_clf.fit(X_train, y_train) + # predict against test data + y_pred = multi_target_clf.predict(X_test) + return classification_report(y_test, y_pred, zero_division=0.0) +datasets = [ + #'games_march2025_cleaned_2k.csv', + 'games_march2025_cleaned_10k.csv', + #'games_march2025_cleaned.csv' +] +estimators = { + "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), + "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter), + "Perceptron": Perceptron(random_state=0, max_iter=max_iter), + "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), + "NearestCentroid": NearestCentroid(), + "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter), + "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0), + "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter), + "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), + "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True), +} + +#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]), +#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]), +for dataset in datasets: + print("-" * 60) + print("dataset -> " + dataset) + print("mkdir") + folder = dataset.split(".csv")[0] + if not os.path.isdir(folder): + os.mkdir(folder) + X_train, X_test, y_train, y_test = prepDataset(dataset) + for esti in estimators: + print("model: " + esti) + compari = comparison(X_train, X_test, y_train, y_test, estimators[esti]) + print("open") + f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8") + f.write(compari) + print("write") + f.close() + 
print("close") \ No newline at end of file diff --git a/compare_models_2k.png b/compare_models_2k.png index d85904f..5e46552 100644 Binary files a/compare_models_2k.png and b/compare_models_2k.png differ diff --git a/generate_compare_dataset.py b/compare_models_2k.py similarity index 92% rename from generate_compare_dataset.py rename to compare_models_2k.py index d5cd4d5..ef4804d 100644 --- a/generate_compare_dataset.py +++ b/compare_models_2k.py @@ -27,6 +27,10 @@ from sklearn.dummy import DummyClassifier from sklearn.neural_network import MLPClassifier set_config(transform_output="pandas") # dataframe supremacy + +jobs = 12 +max_iter = 3000 + def prepDataset(dataset): #returns X_train, X_test, y_train, y_test dataset = pd.read_csv(dataset,sep=",") # desc, genres, tags @@ -76,7 +80,7 @@ def prepDataset(dataset): #returns X_train, X_test, y_train, y_test y_clean = y[mask] # Split dataset return train_test_split(X_clean, y_clean, random_state=0) -def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report +def comparison(X_train, X_test, y_train, y_test, estimator,): #returns class_report multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs) # LogisticRegression(max_iter=1337, random_state=0) # model training multi_target_clf.fit(X_train, y_train) @@ -88,9 +92,6 @@ datasets = [ #'games_march2025_cleaned_10k.csv', #'games_march2025_cleaned.csv' ] - -max_iter = 3000 # <-- set your desired value here - estimators = { "LogisticRegression": LogisticRegression(random_state=0, max_iter=max_iter), "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter), @@ -99,8 +100,8 @@ estimators = { "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter), "KNeighborsClassifier": KNeighborsClassifier(), "NearestCentroid": NearestCentroid(), - "RadiusNeighborsClassifier": RadiusNeighborsClassifier(), - "LinearSVC-i5000": LinearSVC(random_state=0, max_iter=max_iter), + # "RadiusNeighborsClassifier": 
RadiusNeighborsClassifier(), # failed bcs no neighbours in range :sob: + "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter), "SVC": SVC(random_state=0, max_iter=max_iter), "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0), "RandomForestClassifier": RandomForestClassifier(random_state=0), @@ -114,8 +115,7 @@ estimators = { "BernoulliNB": BernoulliNB(), "ComplementNB": ComplementNB(), "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), - "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(), - "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0), + "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/5), verbose=True), "DummyClassifier": DummyClassifier(random_state=0) } @@ -131,7 +131,7 @@ for dataset in datasets: X_train, X_test, y_train, y_test = prepDataset(dataset) for esti in estimators: print("model: " + esti) - compari = comparison(X_train, X_test, y_train, y_test, estimators[esti], 1) #TODO: change the job count if you can + compari = comparison(X_train, X_test, y_train, y_test, estimators[esti]) print("open") f = open(folder + "/" + esti +".txt", mode="w+", encoding="utf-8") f.write(compari) diff --git a/compare_models_2k_3.png b/compare_models_2k_3.png new file mode 100644 index 0000000..83293e3 Binary files /dev/null and b/compare_models_2k_3.png differ diff --git a/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt b/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt new file mode 100644 index 0000000..92d96db --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/AdaBoostClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.87 0.76 0.81 300 + 1 0.70 0.59 0.64 216 + 2 0.58 0.13 0.21 86 + 3 0.56 0.11 0.18 46 + 4 0.71 0.30 0.42 83 + 5 0.00 0.00 0.00 0 + 6 0.69 0.70 0.69 245 + 7 0.62 0.31 0.41 42 + 8 0.76 0.41 0.53 127 + 9 1.00 0.50 0.67 12 + 10 0.67 0.50 0.57 127 + 11 0.40 0.29 0.33 14 + 12 0.74 0.45 0.56 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.74 0.54 0.62 
1404 + macro avg 0.59 0.36 0.43 1404 +weighted avg 0.73 0.54 0.60 1404 + samples avg 0.74 0.59 0.61 1404 diff --git a/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt b/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt new file mode 100644 index 0000000..b0c0e5b --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/BaggingClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.86 0.70 0.77 300 + 1 0.72 0.50 0.59 216 + 2 0.47 0.09 0.16 86 + 3 0.50 0.04 0.08 46 + 4 0.58 0.23 0.33 83 + 5 0.00 0.00 0.00 0 + 6 0.71 0.64 0.67 245 + 7 0.80 0.29 0.42 42 + 8 0.79 0.46 0.58 127 + 9 1.00 0.25 0.40 12 + 10 0.71 0.43 0.53 127 + 11 0.40 0.29 0.33 14 + 12 0.68 0.42 0.52 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.74 0.49 0.59 1404 + macro avg 0.59 0.31 0.39 1404 +weighted avg 0.72 0.49 0.56 1404 + samples avg 0.70 0.54 0.57 1404 diff --git a/games_march2025_cleaned_2k_i3k/BernoulliNB.txt b/games_march2025_cleaned_2k_i3k/BernoulliNB.txt new file mode 100644 index 0000000..f2237d4 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/BernoulliNB.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.75 0.90 0.82 300 + 1 0.72 0.68 0.70 216 + 2 0.50 0.08 0.14 86 + 3 0.27 0.07 0.11 46 + 4 0.40 0.07 0.12 83 + 5 0.00 0.00 0.00 0 + 6 0.77 0.82 0.79 245 + 7 0.33 0.10 0.15 42 + 8 0.67 0.40 0.50 127 + 9 0.00 0.00 0.00 12 + 10 0.71 0.37 0.49 127 + 11 0.00 0.00 0.00 14 + 12 0.49 0.31 0.38 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.70 0.55 0.62 1404 + macro avg 0.40 0.27 0.30 1404 +weighted avg 0.64 0.55 0.56 1404 + samples avg 0.73 0.59 0.61 1404 diff --git a/games_march2025_cleaned_2k_i3k/ComplementNB.txt b/games_march2025_cleaned_2k_i3k/ComplementNB.txt new file mode 100644 index 0000000..cc19e24 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/ComplementNB.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.67 0.98 0.80 300 + 1 0.81 0.36 0.50 216 + 2 0.67 0.05 0.09 86 + 3 0.00 0.00 0.00 46 + 4 0.80 0.05 0.09 83 + 5 0.00 0.00 0.00 0 + 6 0.77 0.81 
0.79 245 + 7 0.40 0.05 0.09 42 + 8 0.83 0.04 0.08 127 + 9 0.00 0.00 0.00 12 + 10 0.43 0.02 0.04 127 + 11 0.00 0.00 0.00 14 + 12 1.00 0.05 0.09 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.70 0.42 0.53 1404 + macro avg 0.46 0.17 0.18 1404 +weighted avg 0.69 0.42 0.42 1404 + samples avg 0.71 0.46 0.52 1404 diff --git a/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt b/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt new file mode 100644 index 0000000..900c256 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/DecisionTreeClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.76 0.73 0.75 300 + 1 0.56 0.53 0.54 216 + 2 0.36 0.33 0.34 86 + 3 0.33 0.26 0.29 46 + 4 0.40 0.46 0.43 83 + 5 0.00 0.00 0.00 0 + 6 0.65 0.61 0.63 245 + 7 0.39 0.40 0.40 42 + 8 0.59 0.57 0.58 127 + 9 0.60 0.25 0.35 12 + 10 0.56 0.51 0.53 127 + 11 0.39 0.50 0.44 14 + 12 0.52 0.49 0.50 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.58 0.55 0.57 1404 + macro avg 0.44 0.40 0.41 1404 +weighted avg 0.58 0.55 0.57 1404 + samples avg 0.59 0.59 0.55 1404 diff --git a/games_march2025_cleaned_2k_i3k/DummyClassifier.txt b/games_march2025_cleaned_2k_i3k/DummyClassifier.txt new file mode 100644 index 0000000..97bf276 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/DummyClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.60 1.00 0.75 300 + 1 0.00 0.00 0.00 216 + 2 0.00 0.00 0.00 86 + 3 0.00 0.00 0.00 46 + 4 0.00 0.00 0.00 83 + 5 0.00 0.00 0.00 0 + 6 0.00 0.00 0.00 245 + 7 0.00 0.00 0.00 42 + 8 0.00 0.00 0.00 127 + 9 0.00 0.00 0.00 12 + 10 0.00 0.00 0.00 127 + 11 0.00 0.00 0.00 14 + 12 0.00 0.00 0.00 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.60 0.21 0.32 1404 + macro avg 0.04 0.07 0.05 1404 +weighted avg 0.13 0.21 0.16 1404 + samples avg 0.60 0.26 0.34 1404 diff --git a/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt b/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt new file mode 100644 index 0000000..9536dc4 --- /dev/null +++ 
b/games_march2025_cleaned_2k_i3k/ExtraTreesClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.81 0.91 0.86 300 + 1 0.78 0.62 0.69 216 + 2 1.00 0.03 0.07 86 + 3 0.00 0.00 0.00 46 + 4 1.00 0.04 0.07 83 + 5 0.00 0.00 0.00 0 + 6 0.78 0.73 0.75 245 + 7 0.00 0.00 0.00 42 + 8 0.84 0.24 0.38 127 + 9 1.00 0.17 0.29 12 + 10 0.90 0.21 0.34 127 + 11 1.00 0.14 0.25 14 + 12 0.83 0.18 0.29 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.80 0.48 0.60 1404 + macro avg 0.64 0.23 0.29 1404 +weighted avg 0.79 0.48 0.52 1404 + samples avg 0.78 0.54 0.60 1404 diff --git a/games_march2025_cleaned_2k_i3k/GaussianNB.txt b/games_march2025_cleaned_2k_i3k/GaussianNB.txt new file mode 100644 index 0000000..83d7a2e --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/GaussianNB.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.76 0.80 0.78 300 + 1 0.62 0.51 0.56 216 + 2 0.63 0.14 0.23 86 + 3 0.17 0.02 0.04 46 + 4 0.42 0.10 0.16 83 + 5 0.00 0.00 0.00 0 + 6 0.68 0.66 0.67 245 + 7 0.56 0.12 0.20 42 + 8 0.55 0.33 0.41 127 + 9 0.67 0.17 0.27 12 + 10 0.65 0.31 0.42 127 + 11 1.00 0.14 0.25 14 + 12 0.53 0.29 0.38 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.66 0.47 0.55 1404 + macro avg 0.52 0.26 0.31 1404 +weighted avg 0.62 0.47 0.51 1404 + samples avg 0.67 0.53 0.55 1404 diff --git a/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt b/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt new file mode 100644 index 0000000..597ff29 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/GradientBoostingClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.87 0.80 0.83 300 + 1 0.77 0.61 0.68 216 + 2 0.55 0.13 0.21 86 + 3 0.42 0.11 0.17 46 + 4 0.68 0.33 0.44 83 + 5 0.00 0.00 0.00 0 + 6 0.71 0.76 0.74 245 + 7 0.61 0.26 0.37 42 + 8 0.81 0.50 0.61 127 + 9 0.75 0.25 0.38 12 + 10 0.81 0.54 0.65 127 + 11 0.40 0.43 0.41 14 + 12 0.69 0.42 0.53 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.76 0.57 0.65 1404 + macro avg 0.58 0.37 0.43 1404 +weighted 
avg 0.74 0.57 0.63 1404 + samples avg 0.77 0.63 0.65 1404 diff --git a/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt b/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt new file mode 100644 index 0000000..2dba977 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/HistGradientBoostingClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.83 0.83 0.83 300 + 1 0.74 0.69 0.72 216 + 2 0.80 0.28 0.41 86 + 3 1.00 0.04 0.08 46 + 4 0.70 0.39 0.50 83 + 5 0.00 0.00 0.00 0 + 6 0.72 0.76 0.74 245 + 7 0.73 0.19 0.30 42 + 8 0.85 0.59 0.70 127 + 9 1.00 0.33 0.50 12 + 10 0.78 0.54 0.64 127 + 11 0.43 0.21 0.29 14 + 12 0.77 0.52 0.62 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.78 0.61 0.68 1404 + macro avg 0.67 0.38 0.45 1404 +weighted avg 0.78 0.61 0.66 1404 + samples avg 0.79 0.67 0.69 1404 diff --git a/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt b/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt new file mode 100644 index 0000000..72af340 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/KNeighborsClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.82 0.62 0.70 300 + 1 0.69 0.46 0.55 216 + 2 0.62 0.06 0.11 86 + 3 0.20 0.02 0.04 46 + 4 0.72 0.16 0.26 83 + 5 0.00 0.00 0.00 0 + 6 0.78 0.55 0.64 245 + 7 0.38 0.12 0.18 42 + 8 0.59 0.65 0.62 127 + 9 1.00 0.67 0.80 12 + 10 0.68 0.44 0.54 127 + 11 1.00 0.29 0.44 14 + 12 0.34 0.76 0.48 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.64 0.48 0.55 1404 + macro avg 0.56 0.34 0.38 1404 +weighted avg 0.68 0.48 0.53 1404 + samples avg 0.64 0.54 0.55 1404 diff --git a/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt b/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt new file mode 100644 index 0000000..fce11ad --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/LinearDiscriminantAnalysis.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.63 0.68 0.66 300 + 1 0.47 0.56 0.51 216 + 2 0.27 0.59 0.37 86 + 3 0.06 0.28 0.10 
46 + 4 0.21 0.52 0.30 83 + 5 0.00 0.00 0.00 0 + 6 0.63 0.67 0.65 245 + 7 0.06 0.29 0.10 42 + 8 0.28 0.52 0.36 127 + 9 0.03 0.42 0.06 12 + 10 0.29 0.52 0.38 127 + 11 0.04 0.43 0.07 14 + 12 0.53 0.44 0.48 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.30 0.57 0.39 1404 + macro avg 0.25 0.42 0.29 1404 +weighted avg 0.44 0.57 0.48 1404 + samples avg 0.42 0.62 0.40 1404 diff --git a/games_march2025_cleaned_2k_i3k/LinearSVC.txt b/games_march2025_cleaned_2k_i3k/LinearSVC.txt new file mode 100644 index 0000000..df82b40 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/LinearSVC.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.85 0.87 0.86 300 + 1 0.76 0.66 0.70 216 + 2 0.77 0.20 0.31 86 + 3 0.00 0.00 0.00 46 + 4 0.76 0.27 0.39 83 + 5 0.00 0.00 0.00 0 + 6 0.78 0.81 0.79 245 + 7 0.89 0.19 0.31 42 + 8 0.77 0.60 0.67 127 + 9 1.00 0.58 0.74 12 + 10 0.85 0.54 0.66 127 + 11 1.00 0.29 0.44 14 + 12 0.82 0.42 0.56 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.80 0.61 0.69 1404 + macro avg 0.66 0.39 0.46 1404 +weighted avg 0.78 0.61 0.66 1404 + samples avg 0.81 0.67 0.69 1404 diff --git a/games_march2025_cleaned_2k_i3k/LogisticRegression.txt b/games_march2025_cleaned_2k_i3k/LogisticRegression.txt new file mode 100644 index 0000000..b7926d4 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/LogisticRegression.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.78 0.91 0.84 300 + 1 0.78 0.62 0.69 216 + 2 1.00 0.03 0.07 86 + 3 0.00 0.00 0.00 46 + 4 1.00 0.04 0.07 83 + 5 0.00 0.00 0.00 0 + 6 0.79 0.81 0.80 245 + 7 0.00 0.00 0.00 42 + 8 0.90 0.34 0.49 127 + 9 0.00 0.00 0.00 12 + 10 0.89 0.25 0.39 127 + 11 0.00 0.00 0.00 14 + 12 0.88 0.14 0.24 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.79 0.50 0.61 1404 + macro avg 0.50 0.22 0.26 1404 +weighted avg 0.77 0.50 0.53 1404 + samples avg 0.77 0.56 0.60 1404 diff --git a/games_march2025_cleaned_2k_i3k/MLPClassifier.txt b/games_march2025_cleaned_2k_i3k/MLPClassifier.txt new file mode 100644 index 0000000..c4634dc --- /dev/null 
+++ b/games_march2025_cleaned_2k_i3k/MLPClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.84 0.85 0.84 300 + 1 0.73 0.67 0.70 216 + 2 0.74 0.30 0.43 86 + 3 0.50 0.02 0.04 46 + 4 0.69 0.24 0.36 83 + 5 0.00 0.00 0.00 0 + 6 0.79 0.79 0.79 245 + 7 0.86 0.14 0.24 42 + 8 0.76 0.63 0.69 127 + 9 1.00 0.33 0.50 12 + 10 0.81 0.52 0.63 127 + 11 1.00 0.14 0.25 14 + 12 0.75 0.41 0.53 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.79 0.60 0.68 1404 + macro avg 0.68 0.36 0.43 1404 +weighted avg 0.78 0.60 0.65 1404 + samples avg 0.80 0.66 0.68 1404 diff --git a/games_march2025_cleaned_2k_i3k/MultinomialNB.txt b/games_march2025_cleaned_2k_i3k/MultinomialNB.txt new file mode 100644 index 0000000..bc74cf3 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/MultinomialNB.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.64 0.99 0.78 300 + 1 0.85 0.24 0.37 216 + 2 0.60 0.03 0.07 86 + 3 0.00 0.00 0.00 46 + 4 0.80 0.05 0.09 83 + 5 0.00 0.00 0.00 0 + 6 0.78 0.80 0.79 245 + 7 0.40 0.05 0.09 42 + 8 1.00 0.04 0.08 127 + 9 0.00 0.00 0.00 12 + 10 0.20 0.01 0.02 127 + 11 0.00 0.00 0.00 14 + 12 1.00 0.05 0.09 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.69 0.40 0.51 1404 + macro avg 0.45 0.16 0.17 1404 +weighted avg 0.68 0.40 0.39 1404 + samples avg 0.70 0.44 0.50 1404 diff --git a/games_march2025_cleaned_2k_i3k/NearestCentroid.txt b/games_march2025_cleaned_2k_i3k/NearestCentroid.txt new file mode 100644 index 0000000..c1de0ab --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/NearestCentroid.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.83 0.75 0.79 300 + 1 0.65 0.75 0.70 216 + 2 0.43 0.72 0.54 86 + 3 0.18 0.33 0.23 46 + 4 0.46 0.61 0.53 83 + 5 0.00 0.00 0.00 0 + 6 0.74 0.76 0.75 245 + 7 0.31 0.62 0.41 42 + 8 0.47 0.69 0.55 127 + 9 1.00 0.67 0.80 12 + 10 0.59 0.69 0.64 127 + 11 0.60 0.64 0.62 14 + 12 0.42 0.66 0.52 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.57 0.70 0.63 1404 + macro avg 0.48 0.56 0.50 1404 +weighted avg 0.62 0.70 0.65 1404 + 
samples avg 0.63 0.74 0.64 1404 diff --git a/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt b/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt new file mode 100644 index 0000000..949c768 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/PassiveAggressiveClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.84 0.86 0.85 300 + 1 0.74 0.63 0.68 216 + 2 0.77 0.31 0.45 86 + 3 0.50 0.04 0.08 46 + 4 0.69 0.33 0.44 83 + 5 0.00 0.00 0.00 0 + 6 0.79 0.80 0.79 245 + 7 0.69 0.26 0.38 42 + 8 0.74 0.62 0.68 127 + 9 1.00 0.67 0.80 12 + 10 0.80 0.57 0.67 127 + 11 1.00 0.50 0.67 14 + 12 0.79 0.46 0.58 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.79 0.62 0.69 1404 + macro avg 0.67 0.43 0.50 1404 +weighted avg 0.77 0.62 0.67 1404 + samples avg 0.80 0.68 0.70 1404 diff --git a/games_march2025_cleaned_2k_i3k/Perceptron.txt b/games_march2025_cleaned_2k_i3k/Perceptron.txt new file mode 100644 index 0000000..acdc33d --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/Perceptron.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.78 0.94 0.85 300 + 1 0.60 0.88 0.71 216 + 2 0.54 0.60 0.57 86 + 3 0.33 0.04 0.08 46 + 4 0.68 0.16 0.25 83 + 5 0.00 0.00 0.00 0 + 6 0.74 0.86 0.80 245 + 7 0.63 0.29 0.39 42 + 8 0.62 0.80 0.69 127 + 9 1.00 0.67 0.80 12 + 10 0.89 0.43 0.58 127 + 11 0.70 0.50 0.58 14 + 12 0.88 0.27 0.42 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.70 0.68 0.69 1404 + macro avg 0.60 0.46 0.48 1404 +weighted avg 0.71 0.68 0.66 1404 + samples avg 0.72 0.74 0.69 1404 diff --git a/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt b/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt new file mode 100644 index 0000000..6fbe546 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/RandomForestClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.80 0.88 0.84 300 + 1 0.78 0.55 0.64 216 + 2 1.00 0.03 0.07 86 + 3 0.00 0.00 0.00 46 + 4 1.00 0.06 0.11 83 + 5 0.00 0.00 0.00 0 + 6 0.74 0.78 0.76 245 + 7 0.00 
0.00 0.00 42 + 8 0.84 0.24 0.38 127 + 9 0.00 0.00 0.00 12 + 10 0.91 0.24 0.38 127 + 11 1.00 0.14 0.25 14 + 12 1.00 0.25 0.39 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.79 0.48 0.59 1404 + macro avg 0.58 0.23 0.27 1404 +weighted avg 0.78 0.48 0.52 1404 + samples avg 0.77 0.54 0.60 1404 diff --git a/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt b/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt new file mode 100644 index 0000000..81250aa --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/RidgeClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.84 0.88 0.86 300 + 1 0.76 0.66 0.70 216 + 2 0.80 0.14 0.24 86 + 3 0.00 0.00 0.00 46 + 4 0.85 0.20 0.33 83 + 5 0.00 0.00 0.00 0 + 6 0.78 0.82 0.80 245 + 7 0.86 0.14 0.24 42 + 8 0.79 0.54 0.64 127 + 9 1.00 0.42 0.59 12 + 10 0.88 0.50 0.64 127 + 11 1.00 0.14 0.25 14 + 12 0.83 0.38 0.52 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.81 0.59 0.68 1404 + macro avg 0.67 0.34 0.42 1404 +weighted avg 0.79 0.59 0.63 1404 + samples avg 0.81 0.65 0.68 1404 diff --git a/games_march2025_cleaned_2k_i3k/SGDClassifier.txt b/games_march2025_cleaned_2k_i3k/SGDClassifier.txt new file mode 100644 index 0000000..ade3141 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/SGDClassifier.txt @@ -0,0 +1,21 @@ + precision recall f1-score support + + 0 0.86 0.84 0.85 300 + 1 0.80 0.52 0.63 216 + 2 0.68 0.35 0.46 86 + 3 0.44 0.09 0.15 46 + 4 0.68 0.34 0.45 83 + 5 0.00 0.00 0.00 0 + 6 0.77 0.80 0.79 245 + 7 0.71 0.24 0.36 42 + 8 0.75 0.55 0.64 127 + 9 1.00 0.58 0.74 12 + 10 0.85 0.52 0.64 127 + 11 0.89 0.57 0.70 14 + 12 0.60 0.64 0.62 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.77 0.61 0.68 1404 + macro avg 0.65 0.43 0.50 1404 +weighted avg 0.77 0.61 0.66 1404 + samples avg 0.79 0.67 0.69 1404 diff --git a/games_march2025_cleaned_2k_i3k/SVC.txt b/games_march2025_cleaned_2k_i3k/SVC.txt new file mode 100644 index 0000000..ff0c7b7 --- /dev/null +++ b/games_march2025_cleaned_2k_i3k/SVC.txt @@ -0,0 +1,21 @@ + precision recall f1-score 
support + + 0 0.81 0.90 0.85 300 + 1 0.76 0.63 0.69 216 + 2 1.00 0.03 0.07 86 + 3 0.00 0.00 0.00 46 + 4 1.00 0.05 0.09 83 + 5 0.00 0.00 0.00 0 + 6 0.77 0.83 0.80 245 + 7 0.00 0.00 0.00 42 + 8 0.84 0.40 0.54 127 + 9 1.00 0.17 0.29 12 + 10 0.90 0.34 0.49 127 + 11 1.00 0.14 0.25 14 + 12 0.92 0.21 0.34 106 + 13 0.00 0.00 0.00 0 + + micro avg 0.80 0.53 0.63 1404 + macro avg 0.64 0.26 0.32 1404 +weighted avg 0.79 0.53 0.56 1404 + samples avg 0.79 0.59 0.63 1404 diff --git a/notebook.ipynb b/notebook.ipynb index d1afe52..00fc10c 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -304,13 +304,47 @@ "``sklearn`` has many different classification Models to choose from, but we only have limited time and computing power.\n", "As such, we tested many different models on the 2k Dataset and chose the 5 best performing ones for the big dataset.\n", "\n", - "### The comparison\n", - "We won't put the comparison script in this notebook, but you can find it in the ``compare_models.py`` file and try it out yourself.\n", + "### Initial Comparison\n", + "We won't put the comparison script in this notebook, but you can find it in the ``compare_models_2k.py`` file and try it out yourself.\n", "There were some rules as a baseline for comparison:\n", "- All Hyperparameters are set to default\n", - "- All iteration limits are set to 3000\n", + "- All iteration limits are set to 3000 (exception: MLPClassifier with 300, since its limit counts epochs rather than iterations)\n", + "- All ``random_state``s are set to 0\n", "\n", - "![Comparison Image](./compare_models_2k.png)" + "Running all models with that configuration yields the following weighted F1-Scores (results as seen in the ``games_march2025_cleaned_2k_i3k`` folder):\n", + "\n", + "![Comparison Image 2k](./compare_models_2k.png)\n", + "\n", + "If we also compare Micro/Macro values, we see that all models have a much lower Macro-F1 than Micro/Weighted-F1. 
That is because the 2k Dataset does not contain enough datapoints for every class (two classes have no samples in the test split), so we should proceed to the 10k Dataset before making major choices.\n", + "\n", + "![Comparison Image 2k Micro/Macro/Weighted](./compare_models_2k_3.png)\n", + "\n", + "The 10 best-performing models, which will be run on the 10k Dataset under the same rules as before:\n", + "1. NearestCentroid\n", + "2. Perceptron\n", + "3. PassiveAggressiveClassifier\n", + "4. LinearSVC\n", + "5. SGDClassifier\n", + "6. HistGradientBoostingClassifier\n", + "7. MLPClassifier\n", + "8. RidgeClassifier\n", + "9. GradientBoostingClassifier\n", + "10. LinearDiscriminantAnalysis\n", + "\n", + "![Comparison Image 10k](./compare_models_10k.png)\n", + "\n", + "We can also compare these models between datasets, to see whether a bigger dataset always improves performance.\n", + "\n", + "![Comparison Image between 2k and 10k](./compare_models_2k_10k.png)\n", + "\n", + "The final contenders are:\n", + "1.\n", + "2.\n", + "3.\n", + "4.\n", + "5.\n", + "\n", + "..." ] }, {