first version of the plot

This commit is contained in:
Maximilian Kany
2025-08-15 11:38:46 +02:00
parent 7421171bb0
commit ee6a31972b
41 changed files with 38 additions and 1523 deletions

View File

@@ -1,140 +0,0 @@
import os
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# Ask scikit-learn transformers (those that support the set_output API) to
# return pandas DataFrames from transform/fit_transform instead of NumPy arrays.
set_config(transform_output="pandas")
def _merge_descriptions(frame):
    """Join the description columns of each row into a single "desc" text column."""
    return frame.fillna('').agg(' '.join, axis=1).to_frame(name="desc")


def _parse_genres(raw):
    """Parse one serialized genre list; a blank cell means "no genres"."""
    text = raw.strip()
    # ast.literal_eval('') raises SyntaxError, so blank cells are mapped to an
    # empty list instead of being evaluated.
    return ast.literal_eval(text) if text else []


def prepDataset(dataset):
    """Load a games CSV and turn it into train/test splits for genre prediction.

    The three description columns are merged into one text, vectorized with
    TF-IDF (features), and the ``genres`` column is one-hot encoded per genre
    (targets). Rows without any genre are dropped before splitting.

    Relies on the module-level ``set_config(transform_output="pandas")`` so
    that the ColumnTransformer returns a DataFrame.

    Args:
        dataset: Path of the CSV file to read. It must contain the columns
            ``detailed_description``, ``about_the_game``,
            ``short_description`` and ``genres`` (a serialized Python list
            per row).

    Returns:
        The result of ``train_test_split(X, y, random_state=0)``, i.e.
        ``X_train, X_test, y_train, y_test``.
    """
    # Read the file the caller asked for (previously a hard-coded 2k file was
    # always loaded, whatever argument was passed).
    frame = pd.read_csv(dataset, sep=",")

    # Keep only: merged description text ("desc") and the raw genres column.
    column_transformer = ColumnTransformer(
        [
            (
                'desc',
                FunctionTransformer(_merge_descriptions),
                ['detailed_description', 'about_the_game', 'short_description'],
            ),
            ('pass', 'passthrough', ['genres']),  # 'tags' is not used yet
        ],
        verbose_feature_names_out=False,
    )
    frame = column_transformer.fit_transform(frame)

    #### SET MISSING VALUES
    print("SETMISS")
    # Only the text columns "desc" and "genres" are left at this point, so a
    # numeric mean fill or a dropna would have nothing to act on; missing
    # cells simply become empty strings.
    frame = frame.fillna('')

    ##### STRUCTURIZE GENRES to one-hot
    # The CSV stores each genre list as text; turn it back into a list.
    genre_lists = frame.pop('genres').map(_parse_genres)
    # MultiLabelBinarizer one-hot encodes list-valued labels.
    mlb_genres = MultiLabelBinarizer()
    genres_encoded = mlb_genres.fit_transform(genre_lists)
    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)

    #### convert text to bag of words (TF-IDF weighted)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc'])
    # NOTE: densifying keeps compatibility with estimators that cannot take
    # sparse input, at the cost of memory on large files.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
    )

    ##### MODEL
    print("MODEL")
    X = tfidf_df
    y = genres_df

    # Drop samples without a target value (all genre columns are 0). X and y
    # were both built with a fresh default index, so one mask fits both.
    has_label = y.sum(axis=1) > 0
    X_clean = X[has_label]
    y_clean = y[has_label]

    # Split dataset
    return train_test_split(X_clean, y_clean, random_state=0)
def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1):
    """Train `estimator` as a multi-output classifier and score it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets, one column per label.
        y_test: Test targets, one column per label.
        estimator: Base classifier handed to ``MultiOutputClassifier``,
            e.g. ``LogisticRegression(max_iter=1337, random_state=0)``.
        jobs: Forwarded as ``n_jobs`` to ``MultiOutputClassifier``.

    Returns:
        The text produced by ``classification_report`` for the test
        predictions, with ``zero_division=0.0``.
    """
    wrapped = MultiOutputClassifier(estimator, n_jobs=jobs)
    # fit() returns the fitted wrapper, so training and prediction can be chained.
    predictions = wrapped.fit(X_train, y_train).predict(X_test)
    return classification_report(y_test, predictions, zero_division=0.0)
# CSV files to benchmark. Judging by the names, the 2k and 10k files are
# presumably subsamples of the full file (not verified here).
datasets = [
    'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    'games_march2025_cleaned.csv',
]

# Report name -> base classifier. The key is also used as the file name of the
# written report, so it must be a valid file name.
estimators = {
    "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0),
    "LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0),
    "LinearSVC-i5000": LinearSVC(max_iter=5000),
    "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
}

# For every dataset: prepare the split once, then train each estimator on it
# and write its classification report to "<dataset name>/<estimator name>.txt".
for dataset in datasets:
    print("-" * 60)
    print("dataset -> " + dataset)
    print("-" * 60)

    print("mkdir")
    # Output folder is the dataset file name without its ".csv" extension.
    folder = os.path.splitext(dataset)[0]
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(folder, exist_ok=True)

    X_train, X_test, y_train, y_test = prepDataset(dataset)

    for esti, estimator in estimators.items():
        # TODO: raise the job count if the machine has cores to spare
        compari = comparison(X_train, X_test, y_train, y_test, estimator, 1)

        print("open")
        # The context manager closes the file even if the write fails.
        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
            f.write(compari)
            print("write")
        print("close")