# Files
# machine-learning/compare_models_10k.py
# 2025-08-20 20:10:58 +02:00
#
# 152 lines
# 6.4 KiB
# Python

import os
import numpy as np
import pandas as pd
from sklearn import set_config
import gc
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
# Make scikit-learn transformers return pandas DataFrames; prepDataset calls
# DataFrame methods (fillna, pop, column access) on the ColumnTransformer output.
set_config(transform_output="pandas") # dataframe supremacy
# Passed as n_jobs to the MultiOutputClassifier built in comparison().
jobs = 4
# Iteration cap handed to the estimators configured in `estimators`.
max_iter = 3000
# A genre is kept as a target column only if at least this many rows carry it
# (see the column filter in prepDataset).
min_entries = 5
def _parse_genre_list(raw):
    """Convert one raw ``genres`` cell into a list of genre names.

    The cell is expected to hold a Python-literal list as text (it is parsed
    with ``ast.literal_eval``).  Missing cells have been replaced by ``''``
    before this runs; ``ast.literal_eval('')`` raises ``SyntaxError``, so
    blank cells are mapped to an empty list instead.  Such rows end up with
    no active target and are removed by the "no target" filter in
    ``prepDataset``.
    """
    if isinstance(raw, str) and not raw.strip():
        return []
    return ast.literal_eval(raw)


def prepDataset(dataset):  # returns X_train, X_test, y_train, y_test
    """Load a games CSV and build a TF-IDF / multi-label genre train-test split.

    Steps:
      1. Read the CSV and merge the three description columns into one
         ``desc`` text column; keep ``genres`` as is.
      2. Parse ``genres`` into lists and one-hot encode them
         (``MultiLabelBinarizer``) -> target frame.
      3. Vectorise ``desc`` with ``TfidfVectorizer`` -> dense feature frame.
      4. Drop genre columns with fewer than ``min_entries`` positive rows,
         then drop rows that have no remaining positive genre.
      5. Split with ``train_test_split(random_state=0)``.

    Relies on the module-level ``min_entries`` and on
    ``set_config(transform_output="pandas")`` having been called, so that the
    ``ColumnTransformer`` returns a DataFrame.

    Args:
        dataset: Path of the comma-separated CSV file.  It must contain the
            columns ``detailed_description``, ``about_the_game``,
            ``short_description`` and ``genres``.

    Returns:
        The list ``[X_train, X_test, y_train, y_test]`` produced by
        ``train_test_split``.
    """
    frame = pd.read_csv(dataset, sep=",")

    # desc, genres, tags
    column_transformer = ColumnTransformer(
        [
            # merge all descriptions into a single text column
            (
                'desc',
                FunctionTransformer(
                    lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")
                ),
                ['detailed_description', 'about_the_game', 'short_description'],
            ),
            ('pass', 'passthrough', ['genres']),  # , 'tags'
        ],
        verbose_feature_names_out=False,
    )
    frame = column_transformer.fit_transform(frame)
    del column_transformer

    #### SET MISSING VALUES
    print("SETMISS")
    # Only the text columns 'desc' and 'genres' are left at this point, so a
    # numeric-mean fill or a dropna() after this fill would be no-ops.
    frame = frame.fillna('')

    ##### STRUCTURIZE GENRES to onehot
    # Text -> Python list; blank cells become [] instead of crashing.
    genre_lists = frame.pop('genres').map(_parse_genre_list)
    # MultiLabelBinarizer does the one-hot encoding for list-valued cells.
    mlb_genres = MultiLabelBinarizer()
    y = pd.DataFrame(mlb_genres.fit_transform(genre_lists), columns=mlb_genres.classes_)
    del genre_lists, mlb_genres

    #### convert text to bag of words
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc'])  # sparse matrix
    # Dense on purpose: the result is used as a regular DataFrame downstream.
    X = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    # Release the intermediates as early as possible to keep peak RAM down.
    del frame, vectorizer, tfidf_matrix

    ##### MODEL
    print("MODEL")
    # Remove genres that have fewer than min_entries entries -> the chance of
    # a broken split (a class missing from one side) would be too big.
    frequent_genres = (y == 1).sum() >= min_entries
    print(y.shape)
    y_prep = y.loc[:, frequent_genres]
    print(y_prep.shape)
    del y, frequent_genres

    # Drop data points without any target value (all target columns are 0).
    # NOTE: the original author observed 31 such rows (dataset not stated).
    has_target = y_prep.sum(axis=1) > 0
    X_clean = X[has_target]
    y_clean = y_prep[has_target]

    # clean ram edition: free the unfiltered frames before the split copies data
    del X, y_prep, has_target
    gc.collect()

    # Split dataset
    return train_test_split(X_clean, y_clean, random_state=0)
def comparison(X_train, X_test, y_train, y_test, estimator):  # returns class_report
    """Train ``estimator`` on every target column and report test performance.

    The estimator (e.g. ``LogisticRegression(max_iter=1337, random_state=0)``)
    is wrapped in a ``MultiOutputClassifier`` whose ``n_jobs`` comes from the
    module-level ``jobs``.  The wrapper is fitted on the training split and
    then used to predict the test split.

    Returns:
        The text produced by ``classification_report`` for the test split,
        computed with ``zero_division=0.0``.
    """
    wrapped = MultiOutputClassifier(estimator, n_jobs=jobs)
    # fit() returns the fitted wrapper, so training and prediction can be chained.
    predictions = wrapped.fit(X_train, y_train).predict(X_test)
    return classification_report(y_test, predictions, zero_division=0.0)
# CSV files to evaluate. Each enabled entry is passed to prepDataset() and
# also names the output folder (file name without ".csv").
# Uncomment an entry to include it in the run.
datasets = [
    #'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    #'games_march2025_cleaned.csv'
]
# Estimators to compare, keyed by the name used for the report file
# ("<folder>/<name>.txt"). Commented-out entries are currently disabled.
estimators = {
    #"RidgeClassifier": RidgeClassifier(random_state=0, max_iter=max_iter),
    #"PassiveAggressiveClassifier": PassiveAggressiveClassifier(random_state=0, max_iter=max_iter),
    #"Perceptron": Perceptron(random_state=0, max_iter=max_iter),
    #"SGDClassifier": SGDClassifier(random_state=0, max_iter=max_iter),
    #"NearestCentroid": NearestCentroid(),
    #"LinearSVC": LinearSVC(random_state=0, max_iter=max_iter),
    #"GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "HistGradientBoostingClassifier": HistGradientBoostingClassifier(random_state=0, max_iter=max_iter),
    #"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    #"MLPClassifier": MLPClassifier(random_state=0, max_iter=int(max_iter/20), early_stopping=True),
}
# Ensemble candidates, kept outside the dict and currently disabled:
#"VotingClassifier": VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
#"StackingClassifier": StackingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())]),
# Driver: for every enabled dataset, prepare the split once, then train each
# enabled estimator and store its classification report as
# "<dataset name without .csv>/<estimator name>.txt".
for dataset in datasets:
    print("-" * 60)
    print("dataset -> " + dataset)

    print("mkdir")
    folder = dataset.split(".csv")[0]
    # exist_ok=True replaces the isdir()/mkdir() check-then-create pair and
    # also creates missing parent directories.
    os.makedirs(folder, exist_ok=True)

    # The split is computed once per dataset and shared by all estimators.
    X_train, X_test, y_train, y_test = prepDataset(dataset)

    for esti, estimator in estimators.items():
        print("model: " + esti)
        compari = comparison(X_train, X_test, y_train, y_test, estimator)

        print("open")
        # The context manager closes the file even if write() raises.
        # Plain "w" is enough: the report is never read back.
        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
            f.write(compari)
            print("write")
        print("close")