"""Compare scikit-learn classifiers on multi-label game-genre prediction.

For every dataset CSV the script merges the description columns into one
text column, turns it into TF-IDF features, one-hot encodes the ``genres``
column, trains each estimator wrapped in a ``MultiOutputClassifier`` and
writes the classification report to ``<dataset name>/<estimator name>.txt``.

Recovered from the patch that added ``comparison.py`` (commit ce433bb,
"comparison script").
"""
import ast
import os

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

set_config(transform_output="pandas")  # dataframe supremacy

# Free-text columns that are concatenated into the single ``desc`` feature.
DESCRIPTION_COLUMNS = ['detailed_description', 'about_the_game', 'short_description']


def _merge_descriptions(frame):
    """Join all columns of *frame* row-wise into one ``desc`` text column."""
    return frame.fillna('').agg(' '.join, axis=1).to_frame(name="desc")


def _parse_genres(raw):
    """Deserialize one ``genres`` cell; an empty cell means "no genres"."""
    # Missing cells are filled with '' before parsing, and
    # ast.literal_eval('') raises SyntaxError, so handle that case here.
    if isinstance(raw, str) and not raw.strip():
        return []
    return ast.literal_eval(raw)


def prepDataset(dataset):
    """Load a games CSV and build the train/test split for genre prediction.

    Args:
        dataset: Path of the CSV file to read. It must contain the columns
            listed in ``DESCRIPTION_COLUMNS`` and a ``genres`` column holding
            a serialized Python list per row.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by
        ``train_test_split(..., random_state=0)``. ``X`` holds the dense
        TF-IDF features of the merged descriptions, ``y`` the one-hot encoded
        genres. Rows without any genre are removed before splitting.
    """
    # Read the file that was asked for (the path used to be hard-coded, so
    # every dataset produced the same result).
    frame = pd.read_csv(dataset, sep=",")

    # Keep only: desc (all descriptions merged) and genres.
    column_transformer = ColumnTransformer(
        [
            ('desc', FunctionTransformer(_merge_descriptions), DESCRIPTION_COLUMNS),
            ('pass', 'passthrough', ['genres']),  # , 'tags'
        ],
        verbose_feature_names_out=False,
    )
    frame = column_transformer.fit_transform(frame)

    #### SET MISSING VALUES
    print("SETMISS")
    # Only text columns are left at this point, so missing values become ''.
    frame = frame.fillna('')

    ##### STRUCTURIZE GENRES to onehot
    genre_lists = frame.pop('genres').map(_parse_genres)

    # MultiLabelBinarizer does the one-hot encoding for list-valued cells.
    mlb_genres = MultiLabelBinarizer()
    genres_df = pd.DataFrame(
        mlb_genres.fit_transform(genre_lists),
        columns=mlb_genres.classes_,
        index=frame.index,
    )

    #### convert text to bag of words
    # NOTE(review): the vectorizer is fitted before the train/test split, so
    # the vocabulary and IDF weights also see the test rows.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc'])
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=frame.index,
    )

    ##### MODEL
    print("MODEL")

    # Drop datapoints that have no target value (all target columns are 0).
    has_genre = genres_df.sum(axis=1) > 0
    return train_test_split(tfidf_df[has_genre], genres_df[has_genre], random_state=0)


def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1):
    """Train *estimator* once per target column and report on the test split.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Multi-label indicator targets matching the features.
        estimator: Unfitted scikit-learn classifier; it is wrapped in a
            ``MultiOutputClassifier``.
        jobs: ``n_jobs`` passed to ``MultiOutputClassifier``.

    Returns:
        The text produced by ``classification_report`` with
        ``zero_division=0.0``.
    """
    multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs)

    # model training
    multi_target_clf.fit(X_train, y_train)

    # predict against test data
    y_pred = multi_target_clf.predict(X_test)
    return classification_report(y_test, y_pred, zero_division=0.0)


datasets = [
    'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    'games_march2025_cleaned.csv',
]

estimators = {
    "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0),
    "LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0),
    "LinearSVC-i5000": LinearSVC(max_iter=5000),
    "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
}


def main():
    """Run every estimator on every dataset and store one report per pair."""
    for dataset in datasets:
        print("-" * 60)
        print("dataset -> " + dataset)
        print("-" * 60)
        print("mkdir")
        # Reports go into a folder named after the dataset file.
        folder = os.path.splitext(dataset)[0]
        os.makedirs(folder, exist_ok=True)
        X_train, X_test, y_train, y_test = prepDataset(dataset)
        for name, estimator in estimators.items():
            # TODO: change the job count if you can
            report = comparison(X_train, X_test, y_train, y_test, estimator, 1)
            print("open")
            # The context manager closes the file even if the write fails.
            with open(os.path.join(folder, name + ".txt"), mode="w", encoding="utf-8") as f:
                f.write(report)
                print("write")
            print("close")


if __name__ == "__main__":
    main()