"""Benchmark scikit-learn classifiers on multi-label game-genre prediction.

For every CSV listed in ``datasets`` the script builds TF-IDF features from the
game descriptions, one-hot encodes the ``genres`` column, fits each entry of
``estimators`` wrapped in a ``MultiOutputClassifier`` and writes the resulting
classification report to ``<dataset name without extension>/<estimator>.txt``.
"""

import ast
import os

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import (
    LogisticRegression,
    PassiveAggressiveClassifier,
    Perceptron,
    RidgeClassifier,
    SGDClassifier,
)
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.neighbors import (
    KNeighborsClassifier,
    NearestCentroid,
    RadiusNeighborsClassifier,
)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Make scikit-learn transformers hand back DataFrames instead of arrays.
set_config(transform_output="pandas")

# Parallel workers handed to MultiOutputClassifier (one fit per genre column).
jobs = 12
# Iteration cap shared by the iterative estimators configured below.
max_iter = 3000


def _merge_descriptions(frame):
    """Join the text columns of *frame* row-wise into one ``desc`` column."""
    return frame.fillna('').agg(' '.join, axis=1).to_frame(name="desc")


def _parse_genres(raw):
    """Turn one ``genres`` cell into a Python object via ``ast.literal_eval``.

    Blank cells (missing values are filled with ``''`` before this is called)
    yield an empty list instead of making ``literal_eval`` raise
    ``SyntaxError``. Anything else is evaluated unchanged, so malformed cells
    still raise.
    """
    if isinstance(raw, str) and not raw.strip():
        return []
    return ast.literal_eval(raw)


def prepDataset(dataset):
    """Load a games CSV and return a train/test split for genre prediction.

    Args:
        dataset: Path of the CSV file, passed to ``pandas.read_csv``. The file
            must contain the columns ``detailed_description``,
            ``about_the_game``, ``short_description`` and ``genres``; each
            ``genres`` cell is parsed with ``ast.literal_eval`` (presumably a
            list of genre names -- verify against the data).

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by
        ``train_test_split(..., random_state=0)``. ``X`` holds the dense TF-IDF
        matrix of the merged descriptions, ``y`` the one-hot encoded genres.
        Rows without any genre are removed before splitting.
    """
    frame = pd.read_csv(dataset, sep=",")

    # Keep only the merged description text and the genres column; the
    # ColumnTransformer drops every column that is not listed here.
    column_transformer = ColumnTransformer(
        [
            # merge all descriptions
            (
                'desc',
                FunctionTransformer(_merge_descriptions),
                ['detailed_description', 'about_the_game', 'short_description'],
            ),
            ('pass', 'passthrough', ['genres']),  # , 'tags'
        ],
        verbose_feature_names_out=False,
    )
    frame = column_transformer.fit_transform(frame)

    #### SET MISSING VALUES
    print("SETMISS")
    # ``desc`` is already NaN-free (see _merge_descriptions), so ``genres`` is
    # the only column that can still hold missing values. Treat them as
    # "no genre"; those rows are filtered out by the mask further down.
    genre_lists = frame.pop('genres').fillna('').map(_parse_genres)

    ##### STRUCTURIZE GENRES to onehot
    # MultiLabelBinarizer one-hot encodes the per-row genre lists.
    mlb_genres = MultiLabelBinarizer()
    genres_encoded = mlb_genres.fit_transform(genre_lists)
    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)

    #### convert text to bag of words (TF-IDF weighted)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc'])
    # NOTE(review): the dense conversion is memory-hungry on large corpora;
    # presumably kept because some benchmarked estimators need dense input --
    # confirm before switching to the sparse matrix.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    ##### MODEL
    print("MODEL")
    X = tfidf_df
    y = genres_df

    # Drop datapoints that have no target value (all target columns are 0).
    # X and y were both built positionally from the same rows, so they share
    # one RangeIndex and the mask applies to both.
    mask = y.sum(axis=1) > 0

    return train_test_split(X[mask], y[mask], random_state=0)


def comparison(X_train, X_test, y_train, y_test, estimator):
    """Fit *estimator* once per target column and report on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets, one column per label.
        y_test: Test targets, one column per label.
        estimator: Classifier that ``MultiOutputClassifier`` fits per column.

    Returns:
        The text produced by ``classification_report`` for the test
        predictions, with ``zero_division=0.0``.
    """
    multi_target_clf = MultiOutputClassifier(estimator, n_jobs=jobs)
    multi_target_clf.fit(X_train, y_train)
    y_pred = multi_target_clf.predict(X_test)
    return classification_report(y_test, y_pred, zero_division=0.0)


# CSV files to benchmark; uncomment entries to include them in a run.
datasets = [
    #'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    #'games_march2025_cleaned.csv'
]

# Report file name -> estimator wrapped by MultiOutputClassifier in comparison().
estimators = {
    "RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
    "Perceptron": Perceptron(random_state=0, max_iter=max_iter),
    "SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
    "NearestCentroid": NearestCentroid(),
    "LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
}
# Currently disabled:
#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),

for dataset in datasets:
    print("-" * 60)
    print("dataset -> " + dataset)

    print("mkdir")
    # One output folder per dataset, named after the CSV without its extension.
    folder = os.path.splitext(dataset)[0]
    os.makedirs(folder, exist_ok=True)

    X_train, X_test, y_train, y_test = prepDataset(dataset)

    for esti, estimator in estimators.items():
        print("model: " + esti)
        compari = comparison(X_train, X_test, y_train, y_test, estimator)

        print("open")
        # The context manager closes the file even if the write fails.
        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
            f.write(compari)
            print("write")
        print("close")