first version of the plot and some noose

This commit is contained in:
Maximilian Kany
2025-08-15 11:40:34 +02:00
parent ee6a31972b
commit 3975cdf7e8
39 changed files with 1520 additions and 0 deletions

18
README.md Normal file
View File

@@ -0,0 +1,18 @@
# Machine Learning Project Summer Semester 2025
This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program.
## Objective
We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions.
As a data basis, we use a publicly available Steam Games dataset that we found on Kaggle.
## Dataset
We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data).
## Contributors
- Maximilian Kany
- Florian Speicher
- Tim Wall

140
comparison.py Normal file
View File

@@ -0,0 +1,140 @@
import os
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
set_config(transform_output="pandas") # dataframe supremacy
def prepDataset(dataset):  # returns X_train, X_test, y_train, y_test
    """Load a games CSV and build the train/test split used for genre prediction.

    The three description columns are merged into one text per game and
    vectorised with TF-IDF (features); the ``genres`` column is parsed and
    one-hot encoded with ``MultiLabelBinarizer`` (targets).

    Args:
        dataset: Path of the CSV file to read. It must contain the columns
            ``detailed_description``, ``about_the_game``,
            ``short_description`` and ``genres``.

    Returns:
        The result of ``train_test_split(X, y, random_state=0)``, i.e.
        ``X_train, X_test, y_train, y_test``, restricted to rows that have
        at least one genre.
    """
    # Read the file that was actually requested. The path used to be
    # hard-coded to the 2k sample, so every dataset produced the same split.
    frame = pd.read_csv(dataset, sep=",")

    # desc, genres
    column_transformer = ColumnTransformer(
        [
            # merge all descriptions (missing ones become empty strings)
            ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
             ['detailed_description', 'about_the_game', 'short_description']),
            ('pass', 'passthrough', ['genres']),  # , 'tags'
        ],
        verbose_feature_names_out=False,
    )
    frame = column_transformer.fit_transform(frame)

    #### HANDLE MISSING VALUES
    print("SETMISS")
    # 'desc' can no longer be missing (filled in the transformer above), so
    # only a missing 'genres' value is left to handle. Such rows have no
    # target and an empty string cannot be parsed by ast.literal_eval, so
    # they are dropped instead of being filled with ''.
    frame = frame.dropna(subset=['genres'])

    ##### STRUCTURIZE GENRES to onehot
    # The genres are stored as the string form of a Python list.
    genre_lists = frame['genres'].map(ast.literal_eval)
    # MultiLabelBinarizer does the one-hot encoding for lists of labels.
    mlb_genres = MultiLabelBinarizer()
    genres_encoded = mlb_genres.fit_transform(genre_lists)
    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)

    #### convert text to bag of words
    # NOTE(review): the vectorizer is fitted on all rows before the split, so
    # the vocabulary/IDF weights also see the test rows - confirm this is
    # acceptable for the comparison.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(frame['desc'])  # sparse matrix
    # Both frames are rebuilt from arrays, so they share a fresh 0..n-1 index
    # and the boolean mask below lines up row by row.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    ##### MODEL
    print("MODEL")
    # Drop datapoints that don't have a target value (all target columns 0).
    has_genre = genres_df.sum(axis=1) > 0

    # Split dataset
    return train_test_split(tfidf_df[has_genre], genres_df[has_genre], random_state=0)
def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1):  # returns class_report
    """Fit one estimator per target column and report how it does on the test split.

    Args:
        X_train, y_train: Features and targets used for fitting.
        X_test, y_test: Features and targets used for evaluation.
        estimator: Base estimator handed to ``MultiOutputClassifier``,
            e.g. ``LogisticRegression(max_iter=1337, random_state=0)``.
        jobs: Forwarded as ``n_jobs`` to ``MultiOutputClassifier``.

    Returns:
        The text produced by ``classification_report`` (``zero_division=0.0``).
    """
    model = MultiOutputClassifier(estimator, n_jobs=jobs)
    # train, then predict against the test data
    predictions = model.fit(X_train, y_train).predict(X_test)
    return classification_report(y_test, predictions, zero_division=0.0)
# CSV files to run the comparison on.
datasets = [
    'games_march2025_cleaned_2k.csv',
    'games_march2025_cleaned_10k.csv',
    'games_march2025_cleaned.csv',
]

# Report name -> base estimator. The name is also used as the file name of
# the written report, so keep it file-system friendly.
estimators = {
    "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0),
    "LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0),
    "LinearSVC-i5000": LinearSVC(max_iter=5000),
    "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
}

# For every dataset: create an output folder named after the CSV, build the
# split once, then write one classification report per estimator into it.
for dataset in datasets:
    print("-" * 60)
    print("dataset -> " + dataset)
    print("-" * 60)
    print("mkdir")
    folder = dataset.split(".csv")[0]
    # exist_ok avoids the separate isdir check (and the race between the two).
    os.makedirs(folder, exist_ok=True)
    X_train, X_test, y_train, y_test = prepDataset(dataset)
    for esti, estimator in estimators.items():
        compari = comparison(X_train, X_test, y_train, y_test, estimator, 1)  # TODO: change the job count if you can
        print("open")
        # The context manager closes the file even if the write fails.
        with open(os.path.join(folder, esti + ".txt"), mode="w", encoding="utf-8") as f:
            f.write(compari)
            print("write")
        print("close")

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.75 0.90 0.82 300
1 0.72 0.68 0.70 216
2 0.50 0.08 0.14 86
3 0.27 0.07 0.11 46
4 0.40 0.07 0.12 83
5 0.00 0.00 0.00 0
6 0.77 0.82 0.79 245
7 0.33 0.10 0.15 42
8 0.67 0.40 0.50 127
9 0.00 0.00 0.00 12
10 0.71 0.37 0.49 127
11 0.00 0.00 0.00 14
12 0.49 0.31 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.70 0.55 0.62 1404
macro avg 0.40 0.27 0.30 1404
weighted avg 0.64 0.55 0.56 1404
samples avg 0.73 0.59 0.61 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.73 0.75 300
1 0.56 0.53 0.54 216
2 0.36 0.33 0.34 86
3 0.33 0.26 0.29 46
4 0.40 0.46 0.43 83
5 0.00 0.00 0.00 0
6 0.65 0.61 0.63 245
7 0.39 0.40 0.40 42
8 0.59 0.57 0.58 127
9 0.60 0.25 0.35 12
10 0.56 0.51 0.53 127
11 0.39 0.50 0.44 14
12 0.52 0.49 0.50 106
13 0.00 0.00 0.00 0
micro avg 0.58 0.55 0.57 1404
macro avg 0.44 0.40 0.41 1404
weighted avg 0.58 0.55 0.57 1404
samples avg 0.59 0.59 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.80 0.78 300
1 0.62 0.51 0.56 216
2 0.63 0.14 0.23 86
3 0.17 0.02 0.04 46
4 0.42 0.10 0.16 83
5 0.00 0.00 0.00 0
6 0.68 0.66 0.67 245
7 0.56 0.12 0.20 42
8 0.55 0.33 0.41 127
9 0.67 0.17 0.27 12
10 0.65 0.31 0.42 127
11 1.00 0.14 0.25 14
12 0.53 0.29 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.66 0.47 0.55 1404
macro avg 0.52 0.26 0.31 1404
weighted avg 0.62 0.47 0.51 1404
samples avg 0.67 0.53 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.80 0.83 300
1 0.77 0.61 0.68 216
2 0.55 0.13 0.21 86
3 0.42 0.11 0.17 46
4 0.68 0.33 0.44 83
5 0.00 0.00 0.00 0
6 0.71 0.76 0.74 245
7 0.61 0.26 0.37 42
8 0.81 0.50 0.61 127
9 0.75 0.25 0.38 12
10 0.81 0.54 0.65 127
11 0.40 0.43 0.41 14
12 0.69 0.42 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.76 0.57 0.65 1404
macro avg 0.57 0.37 0.43 1404
weighted avg 0.74 0.57 0.63 1404
samples avg 0.76 0.63 0.65 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.87 0.86 300
1 0.76 0.66 0.70 216
2 0.77 0.20 0.31 86
3 0.00 0.00 0.00 46
4 0.76 0.27 0.39 83
5 0.00 0.00 0.00 0
6 0.78 0.81 0.79 245
7 0.89 0.19 0.31 42
8 0.77 0.60 0.67 127
9 1.00 0.58 0.74 12
10 0.85 0.54 0.66 127
11 1.00 0.29 0.44 14
12 0.82 0.42 0.56 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.61 0.69 1404
macro avg 0.66 0.39 0.46 1404
weighted avg 0.78 0.61 0.66 1404
samples avg 0.81 0.67 0.69 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.84 0.85 0.84 300
1 0.73 0.67 0.70 216
2 0.74 0.30 0.43 86
3 0.50 0.02 0.04 46
4 0.69 0.24 0.36 83
5 0.00 0.00 0.00 0
6 0.79 0.79 0.79 245
7 0.86 0.14 0.24 42
8 0.76 0.63 0.69 127
9 1.00 0.33 0.50 12
10 0.81 0.52 0.63 127
11 1.00 0.14 0.25 14
12 0.75 0.41 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.60 0.68 1404
macro avg 0.68 0.36 0.43 1404
weighted avg 0.78 0.60 0.65 1404
samples avg 0.80 0.66 0.68 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.64 0.99 0.78 300
1 0.85 0.24 0.37 216
2 0.60 0.03 0.07 86
3 0.00 0.00 0.00 46
4 0.80 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.78 0.80 0.79 245
7 0.40 0.05 0.09 42
8 1.00 0.04 0.08 127
9 0.00 0.00 0.00 12
10 0.20 0.01 0.02 127
11 0.00 0.00 0.00 14
12 1.00 0.05 0.09 106
13 0.00 0.00 0.00 0
micro avg 0.69 0.40 0.51 1404
macro avg 0.45 0.16 0.17 1404
weighted avg 0.68 0.40 0.39 1404
samples avg 0.70 0.44 0.50 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.80 0.88 0.84 300
1 0.78 0.55 0.64 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.06 0.11 83
5 0.00 0.00 0.00 0
6 0.74 0.78 0.76 245
7 0.00 0.00 0.00 42
8 0.84 0.24 0.38 127
9 0.00 0.00 0.00 12
10 0.91 0.24 0.38 127
11 1.00 0.14 0.25 14
12 1.00 0.25 0.39 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.48 0.59 1404
macro avg 0.58 0.23 0.27 1404
weighted avg 0.78 0.48 0.52 1404
samples avg 0.77 0.54 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.81 0.90 0.85 300
1 0.76 0.63 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.77 0.83 0.80 245
7 0.00 0.00 0.00 42
8 0.84 0.40 0.54 127
9 1.00 0.17 0.29 12
10 0.90 0.34 0.49 127
11 1.00 0.14 0.25 14
12 0.92 0.21 0.34 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.53 0.63 1404
macro avg 0.64 0.26 0.32 1404
weighted avg 0.79 0.53 0.56 1404
samples avg 0.79 0.59 0.63 1404

BIN
games_march2025_cleaned_10k.csv LFS Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:12cf598a6e41d83cfa9c16e99d4d9578cb4ee7c3594fae9f9b921772887a08d7
3 size 68658136

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.75 0.90 0.82 300
1 0.72 0.68 0.70 216
2 0.50 0.08 0.14 86
3 0.27 0.07 0.11 46
4 0.40 0.07 0.12 83
5 0.00 0.00 0.00 0
6 0.77 0.82 0.79 245
7 0.33 0.10 0.15 42
8 0.67 0.40 0.50 127
9 0.00 0.00 0.00 12
10 0.71 0.37 0.49 127
11 0.00 0.00 0.00 14
12 0.49 0.31 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.70 0.55 0.62 1404
macro avg 0.40 0.27 0.30 1404
weighted avg 0.64 0.55 0.56 1404
samples avg 0.73 0.59 0.61 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.73 0.75 300
1 0.56 0.53 0.54 216
2 0.36 0.33 0.34 86
3 0.33 0.26 0.29 46
4 0.40 0.46 0.43 83
5 0.00 0.00 0.00 0
6 0.65 0.61 0.63 245
7 0.39 0.40 0.40 42
8 0.59 0.57 0.58 127
9 0.60 0.25 0.35 12
10 0.56 0.51 0.53 127
11 0.39 0.50 0.44 14
12 0.52 0.49 0.50 106
13 0.00 0.00 0.00 0
micro avg 0.58 0.55 0.57 1404
macro avg 0.44 0.40 0.41 1404
weighted avg 0.58 0.55 0.57 1404
samples avg 0.59 0.59 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.80 0.78 300
1 0.62 0.51 0.56 216
2 0.63 0.14 0.23 86
3 0.17 0.02 0.04 46
4 0.42 0.10 0.16 83
5 0.00 0.00 0.00 0
6 0.68 0.66 0.67 245
7 0.56 0.12 0.20 42
8 0.55 0.33 0.41 127
9 0.67 0.17 0.27 12
10 0.65 0.31 0.42 127
11 1.00 0.14 0.25 14
12 0.53 0.29 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.66 0.47 0.55 1404
macro avg 0.52 0.26 0.31 1404
weighted avg 0.62 0.47 0.51 1404
samples avg 0.67 0.53 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.80 0.83 300
1 0.77 0.61 0.68 216
2 0.55 0.13 0.21 86
3 0.42 0.11 0.17 46
4 0.68 0.33 0.44 83
5 0.00 0.00 0.00 0
6 0.71 0.76 0.74 245
7 0.61 0.26 0.37 42
8 0.81 0.50 0.61 127
9 0.75 0.25 0.38 12
10 0.81 0.54 0.65 127
11 0.40 0.43 0.41 14
12 0.69 0.42 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.76 0.57 0.65 1404
macro avg 0.57 0.37 0.43 1404
weighted avg 0.74 0.57 0.63 1404
samples avg 0.76 0.63 0.65 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.87 0.86 300
1 0.76 0.66 0.70 216
2 0.77 0.20 0.31 86
3 0.00 0.00 0.00 46
4 0.76 0.27 0.39 83
5 0.00 0.00 0.00 0
6 0.78 0.81 0.79 245
7 0.89 0.19 0.31 42
8 0.77 0.60 0.67 127
9 1.00 0.58 0.74 12
10 0.85 0.54 0.66 127
11 1.00 0.29 0.44 14
12 0.82 0.42 0.56 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.61 0.69 1404
macro avg 0.66 0.39 0.46 1404
weighted avg 0.78 0.61 0.66 1404
samples avg 0.81 0.67 0.69 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.84 0.85 0.84 300
1 0.73 0.67 0.70 216
2 0.74 0.30 0.43 86
3 0.50 0.02 0.04 46
4 0.69 0.24 0.36 83
5 0.00 0.00 0.00 0
6 0.79 0.79 0.79 245
7 0.86 0.14 0.24 42
8 0.76 0.63 0.69 127
9 1.00 0.33 0.50 12
10 0.81 0.52 0.63 127
11 1.00 0.14 0.25 14
12 0.75 0.41 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.60 0.68 1404
macro avg 0.68 0.36 0.43 1404
weighted avg 0.78 0.60 0.65 1404
samples avg 0.80 0.66 0.68 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.64 0.99 0.78 300
1 0.85 0.24 0.37 216
2 0.60 0.03 0.07 86
3 0.00 0.00 0.00 46
4 0.80 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.78 0.80 0.79 245
7 0.40 0.05 0.09 42
8 1.00 0.04 0.08 127
9 0.00 0.00 0.00 12
10 0.20 0.01 0.02 127
11 0.00 0.00 0.00 14
12 1.00 0.05 0.09 106
13 0.00 0.00 0.00 0
micro avg 0.69 0.40 0.51 1404
macro avg 0.45 0.16 0.17 1404
weighted avg 0.68 0.40 0.39 1404
samples avg 0.70 0.44 0.50 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.80 0.88 0.84 300
1 0.78 0.55 0.64 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.06 0.11 83
5 0.00 0.00 0.00 0
6 0.74 0.78 0.76 245
7 0.00 0.00 0.00 42
8 0.84 0.24 0.38 127
9 0.00 0.00 0.00 12
10 0.91 0.24 0.38 127
11 1.00 0.14 0.25 14
12 1.00 0.25 0.39 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.48 0.59 1404
macro avg 0.58 0.23 0.27 1404
weighted avg 0.78 0.48 0.52 1404
samples avg 0.77 0.54 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.81 0.90 0.85 300
1 0.76 0.63 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.77 0.83 0.80 245
7 0.00 0.00 0.00 42
8 0.84 0.40 0.54 127
9 1.00 0.17 0.29 12
10 0.90 0.34 0.49 127
11 1.00 0.14 0.25 14
12 0.92 0.21 0.34 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.53 0.63 1404
macro avg 0.64 0.26 0.32 1404
weighted avg 0.79 0.53 0.56 1404
samples avg 0.79 0.59 0.63 1404

BIN
games_march2025_cleaned_2k.csv LFS Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:75ba38404995149bcb8e5a321459f73b4adf58597f85bab396dd054cc78c145d
3 size 15455174

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.75 0.90 0.82 300
1 0.72 0.68 0.70 216
2 0.50 0.08 0.14 86
3 0.27 0.07 0.11 46
4 0.40 0.07 0.12 83
5 0.00 0.00 0.00 0
6 0.77 0.82 0.79 245
7 0.33 0.10 0.15 42
8 0.67 0.40 0.50 127
9 0.00 0.00 0.00 12
10 0.71 0.37 0.49 127
11 0.00 0.00 0.00 14
12 0.49 0.31 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.70 0.55 0.62 1404
macro avg 0.40 0.27 0.30 1404
weighted avg 0.64 0.55 0.56 1404
samples avg 0.73 0.59 0.61 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.73 0.75 300
1 0.56 0.53 0.54 216
2 0.36 0.33 0.34 86
3 0.33 0.26 0.29 46
4 0.40 0.46 0.43 83
5 0.00 0.00 0.00 0
6 0.65 0.61 0.63 245
7 0.39 0.40 0.40 42
8 0.59 0.57 0.58 127
9 0.60 0.25 0.35 12
10 0.56 0.51 0.53 127
11 0.39 0.50 0.44 14
12 0.52 0.49 0.50 106
13 0.00 0.00 0.00 0
micro avg 0.58 0.55 0.57 1404
macro avg 0.44 0.40 0.41 1404
weighted avg 0.58 0.55 0.57 1404
samples avg 0.59 0.59 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.76 0.80 0.78 300
1 0.62 0.51 0.56 216
2 0.63 0.14 0.23 86
3 0.17 0.02 0.04 46
4 0.42 0.10 0.16 83
5 0.00 0.00 0.00 0
6 0.68 0.66 0.67 245
7 0.56 0.12 0.20 42
8 0.55 0.33 0.41 127
9 0.67 0.17 0.27 12
10 0.65 0.31 0.42 127
11 1.00 0.14 0.25 14
12 0.53 0.29 0.38 106
13 0.00 0.00 0.00 0
micro avg 0.66 0.47 0.55 1404
macro avg 0.52 0.26 0.31 1404
weighted avg 0.62 0.47 0.51 1404
samples avg 0.67 0.53 0.55 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.80 0.83 300
1 0.77 0.61 0.68 216
2 0.55 0.13 0.21 86
3 0.42 0.11 0.17 46
4 0.68 0.33 0.44 83
5 0.00 0.00 0.00 0
6 0.71 0.76 0.74 245
7 0.61 0.26 0.37 42
8 0.81 0.50 0.61 127
9 0.75 0.25 0.38 12
10 0.81 0.54 0.65 127
11 0.40 0.43 0.41 14
12 0.69 0.42 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.76 0.57 0.65 1404
macro avg 0.57 0.37 0.43 1404
weighted avg 0.74 0.57 0.63 1404
samples avg 0.76 0.63 0.65 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.85 0.87 0.86 300
1 0.76 0.66 0.70 216
2 0.77 0.20 0.31 86
3 0.00 0.00 0.00 46
4 0.76 0.27 0.39 83
5 0.00 0.00 0.00 0
6 0.78 0.81 0.79 245
7 0.89 0.19 0.31 42
8 0.77 0.60 0.67 127
9 1.00 0.58 0.74 12
10 0.85 0.54 0.66 127
11 1.00 0.29 0.44 14
12 0.82 0.42 0.56 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.61 0.69 1404
macro avg 0.66 0.39 0.46 1404
weighted avg 0.78 0.61 0.66 1404
samples avg 0.81 0.67 0.69 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.78 0.91 0.84 300
1 0.78 0.62 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.04 0.07 83
5 0.00 0.00 0.00 0
6 0.79 0.81 0.80 245
7 0.00 0.00 0.00 42
8 0.90 0.34 0.49 127
9 0.00 0.00 0.00 12
10 0.89 0.25 0.39 127
11 0.00 0.00 0.00 14
12 0.88 0.14 0.24 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.50 0.61 1404
macro avg 0.50 0.22 0.26 1404
weighted avg 0.77 0.50 0.53 1404
samples avg 0.77 0.56 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.84 0.85 0.84 300
1 0.73 0.67 0.70 216
2 0.74 0.30 0.43 86
3 0.50 0.02 0.04 46
4 0.69 0.24 0.36 83
5 0.00 0.00 0.00 0
6 0.79 0.79 0.79 245
7 0.86 0.14 0.24 42
8 0.76 0.63 0.69 127
9 1.00 0.33 0.50 12
10 0.81 0.52 0.63 127
11 1.00 0.14 0.25 14
12 0.75 0.41 0.53 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.60 0.68 1404
macro avg 0.68 0.36 0.43 1404
weighted avg 0.78 0.60 0.65 1404
samples avg 0.80 0.66 0.68 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.64 0.99 0.78 300
1 0.85 0.24 0.37 216
2 0.60 0.03 0.07 86
3 0.00 0.00 0.00 46
4 0.80 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.78 0.80 0.79 245
7 0.40 0.05 0.09 42
8 1.00 0.04 0.08 127
9 0.00 0.00 0.00 12
10 0.20 0.01 0.02 127
11 0.00 0.00 0.00 14
12 1.00 0.05 0.09 106
13 0.00 0.00 0.00 0
micro avg 0.69 0.40 0.51 1404
macro avg 0.45 0.16 0.17 1404
weighted avg 0.68 0.40 0.39 1404
samples avg 0.70 0.44 0.50 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.80 0.88 0.84 300
1 0.78 0.55 0.64 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.06 0.11 83
5 0.00 0.00 0.00 0
6 0.74 0.78 0.76 245
7 0.00 0.00 0.00 42
8 0.84 0.24 0.38 127
9 0.00 0.00 0.00 12
10 0.91 0.24 0.38 127
11 1.00 0.14 0.25 14
12 1.00 0.25 0.39 106
13 0.00 0.00 0.00 0
micro avg 0.79 0.48 0.59 1404
macro avg 0.58 0.23 0.27 1404
weighted avg 0.78 0.48 0.52 1404
samples avg 0.77 0.54 0.60 1404

View File

@@ -0,0 +1,21 @@
precision recall f1-score support
0 0.81 0.90 0.85 300
1 0.76 0.63 0.69 216
2 1.00 0.03 0.07 86
3 0.00 0.00 0.00 46
4 1.00 0.05 0.09 83
5 0.00 0.00 0.00 0
6 0.77 0.83 0.80 245
7 0.00 0.00 0.00 42
8 0.84 0.40 0.54 127
9 1.00 0.17 0.29 12
10 0.90 0.34 0.49 127
11 1.00 0.14 0.25 14
12 0.92 0.21 0.34 106
13 0.00 0.00 0.00 0
micro avg 0.80 0.53 0.63 1404
macro avg 0.64 0.26 0.32 1404
weighted avg 0.79 0.53 0.56 1404
samples avg 0.79 0.59 0.63 1404

530
notebook.ipynb Normal file
View File

@@ -0,0 +1,530 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a3a7634f",
"metadata": {},
"source": [
"# Machine Learning project in SoSe 2025 at HTW Saar\n",
"## Idea\n",
"The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n",
"\n",
"## Dataset\n",
"For our project we use a Steam Dataset provided on moodle, since it has all information we plan on using.\n",
"The Dataset has been cut to only 2000 data points to be runnable on weaker devices."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3116b75f",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" appid name release_date required_age price dlc_count \\\n",
"0 730 Counter-Strike 2 2012-08-21 0 0.0 1 \n",
"\n",
" detailed_description \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" about_the_game \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"\n",
" short_description reviews ... \\\n",
"0 For over two decades, Counter-Strike has offer... NaN ... \n",
"\n",
" average_playtime_2weeks median_playtime_forever median_playtime_2weeks \\\n",
"0 879 5174 350 \n",
"\n",
" discount peak_ccu tags \\\n",
"0 0 1212356 {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'... \n",
"\n",
" pct_pos_total num_reviews_total pct_pos_recent num_reviews_recent \n",
"0 86 8632939 82 96473 \n",
"\n",
"[1 rows x 47 columns]\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import set_config\n",
"\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"dataset = pd.read_csv(\"./games_march2025_cleaned_2k.csv\",sep=\",\")\n",
"print(dataset.head(1))"
]
},
{
"cell_type": "markdown",
"id": "cba9750a",
"metadata": {},
"source": [
"## Preparation of the Dataset\n",
"### Removing Uniques\n",
"We would remove the following features from the Training-Set as they can/could uniquely identify a datapoint, but we don't as they will be removed in the next step anyway\n",
"- AppId\n",
"- Name of the Game\n",
"- Release Date\n",
"- Reviews\n",
"- Header Image\n",
"- Website\n",
"- Support URL\n",
"- Support Email\n",
"- MetaCritic URL\n",
"- Developer\n",
"- Publisher\n",
"- Screenshots\n",
"- Movies\n",
"- Estimated Owners"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d159117377f3633c",
"metadata": {},
"outputs": [],
"source": [
"#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
"#print(dataset.head())"
]
},
{
"cell_type": "markdown",
"id": "e1b28ddd69f1e9a6",
"metadata": {},
"source": [
"## Hold onto necessary information\n",
"Our model should turn a textual description of a game into its genre. For that we need all the textual information a game has, as well as the genres of the game.\n",
"We use a ColumnTransformer to drop all unnecessary columns, merge all descriptions of a game into one big description and hold onto the genres\n",
"\n",
"It is important to use ``verbose_feature_names_out=False`` so the feature names don't get changed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "986fbb31a7ae0d8b",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" desc \\\n",
"0 For over two decades, Counter-Strike has offer... \n",
"1 LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ... \n",
"2 The most-played game on Steam. Every day, mill... \n",
"3 When a young street hustler, a retired bank ro... \n",
"4 Edition Comparison Ultimate Edition The Tom Cl... \n",
"\n",
" genres \n",
"0 ['Action', 'Free To Play'] \n",
"1 ['Action', 'Adventure', 'Massively Multiplayer... \n",
"2 ['Action', 'Strategy', 'Free To Play'] \n",
"3 ['Action', 'Adventure'] \n",
"4 ['Action'] \n"
]
}
],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import FunctionTransformer\n",
"\n",
"# desc, genres\n",
"column_transformer = ColumnTransformer([\n",
" # merge all descriptions\n",
" ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name=\"desc\")),\n",
" ['detailed_description', 'about_the_game', 'short_description']),\n",
" ('pass', 'passthrough', ['genres']),\n",
" ],\n",
" verbose_feature_names_out=False\n",
")\n",
"dataset = column_transformer.fit_transform(dataset)\n",
"print(dataset.head())"
]
},
{
"cell_type": "markdown",
"id": "f9b89c0645811564",
"metadata": {},
"source": [
"### Adding missing Information\n",
"Some Games might not have any descriptions. For these we Input an Empty String\n",
"**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44239f6b7fd23cde",
"metadata": {},
"outputs": [],
"source": [
"# missing numeric values => mean\n",
"dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n",
"# missing strings => empty string?\n",
"dataset.fillna('', inplace=True)\n",
"# drop all lines with missing values\n",
"dataset.dropna(inplace=True)"
]
},
{
"cell_type": "markdown",
"id": "ca5b59b9fa8160a0",
"metadata": {},
"source": [
"## Transform Genres\n",
"The genre information currently is a string holding a python array of genres. While this is machine-readable, we need One-Hot-Encoding for our model to work.\n",
"\n",
"#### Serializing the String-Array\n",
"The \"ast\" library can safely parse a string that contains a Python literal (``ast.literal_eval``), so we use it to deserialize the genre strings into Python lists."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebc5a24e9bc87fdd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 [Action, Free To Play]\n",
"1 [Action, Adventure, Massively Multiplayer, Fre...\n",
"2 [Action, Strategy, Free To Play]\n",
"3 [Action, Adventure]\n",
"4 [Action]\n",
"Name: genres, dtype: object\n"
]
}
],
"source": [
"import ast\n",
"\n",
"dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))\n",
"print(dataset['genres'].head())"
]
},
{
"cell_type": "markdown",
"id": "f90756f9ad9211f4",
"metadata": {},
"source": [
"#### One-Hot-Encoding a Python-Array\n",
"The sklearn ``OneHotEncoder()`` is only able to work with a 1D array of different classes, such as ``['Politics', 'Sport', 'Culture']``. Every datapoint can only have one concurrent classification.\n",
"Steam allows an app/bundle to have multiple genres. As such, our dataset has a 2D array of different classes, which sklearn's ``MultiLabelBinarizer()`` does support."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2c3527a5fc876bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Action Adventure Casual Early Access Free To Play Gore Indie \\\n",
"0 1 0 0 0 1 0 0 \n",
"1 1 1 0 0 1 0 0 \n",
"2 1 0 0 0 1 0 0 \n",
"3 1 1 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 \n",
"\n",
" Massively Multiplayer RPG Racing Simulation Sports Strategy Violent \n",
"0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 0 \n",
"3 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 \n"
]
}
],
"source": [
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"\n",
"mlb_genres = MultiLabelBinarizer()\n",
"genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))\n",
"genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)\n",
"print(genres_df.head())"
]
},
{
"cell_type": "markdown",
"id": "671c01f9f4ae66d9",
"metadata": {},
"source": [
"With this, our target matrix is completed."
]
},
{
"cell_type": "markdown",
"id": "f5436c87",
"metadata": {},
"source": [
"### Structurizing Text\n",
"If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Term Frequency-Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value. **TODO: filter out stopwords**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e8b407c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 29351 columns]\n"
]
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"vectorizer = TfidfVectorizer()\n",
"tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
"tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
"print(tfidf_df.head())"
]
},
{
"cell_type": "markdown",
"id": "ad84e777",
"metadata": {},
"source": [
"With this our feature matrix is completed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86d9da42f4df8e49",
"metadata": {},
"outputs": [],
"source": [
"X = tfidf_df\n",
"y = genres_df"
]
},
{
"cell_type": "markdown",
"id": "aeb782668f311cd8",
"metadata": {},
"source": [
"## The Model\n",
"\n",
"#### Removing unpredictable Datapoints\n",
"Some Datapoints don't have a genre assigned (all feature values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n",
"We filter after all values that we can use with a mask, and apply that mask to our matrices."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4919bf1b37d171a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [
"mask = y.sum(axis=1).map(lambda x: x > 0)\n",
"print((mask == False).sum()) # count of unpredictable datapoints\n",
"\n",
"X_clean = X[mask]\n",
"y_clean = y[mask]"
]
},
{
"cell_type": "markdown",
"id": "091d7e13",
"metadata": {},
"source": [
"# Splitting up data\n",
"We have to split up our data into training and testing data.\n",
"Using random_state=0 guarantees reproducibility."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfbf3787",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
]
},
{
"cell_type": "markdown",
"id": "12b5283d",
"metadata": {},
"source": [
"# Model Selection\n",
"**TODO Deciding which model to use for this task**\n",
"\n",
"As a game can have multiple genres, our Model(s) has to be capable of multi-label-classification. sklearn's ``MultiOutputClassifier`` can do this. As a backend for ``MultiOutputClassifier`` we use ``LogisticRegression``"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c1d72c4532bd509",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.multioutput import MultiOutputClassifier\n",
"\n",
"# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is to bad)\n",
"multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n",
"\n",
"multi_target_clf.fit(X_train, y_train)\n",
"\n",
"y_pred = multi_target_clf.predict(X_test)"
]
},
{
"cell_type": "markdown",
"id": "0faa9856",
"metadata": {},
"source": [
"# Evaluation\n",
"**TODO Test the Model with the test data**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2ebea6945193e07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.78 0.91 0.84 300\n",
" 1 0.78 0.62 0.69 216\n",
" 2 1.00 0.03 0.07 86\n",
" 3 0.00 0.00 0.00 46\n",
" 4 1.00 0.04 0.07 83\n",
" 5 0.00 0.00 0.00 0\n",
" 6 0.79 0.81 0.80 245\n",
" 7 0.00 0.00 0.00 42\n",
" 8 0.90 0.34 0.49 127\n",
" 9 0.00 0.00 0.00 12\n",
" 10 0.89 0.25 0.39 127\n",
" 11 0.00 0.00 0.00 14\n",
" 12 0.88 0.14 0.24 106\n",
" 13 0.00 0.00 0.00 0\n",
"\n",
" micro avg 0.79 0.50 0.61 1404\n",
" macro avg 0.50 0.22 0.26 1404\n",
"weighted avg 0.77 0.50 0.53 1404\n",
" samples avg 0.77 0.56 0.60 1404\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"print(classification_report(y_test, y_pred, zero_division=0.0))"
]
},
{
"cell_type": "markdown",
"id": "2aeb6fc2",
"metadata": {},
"source": [
"# Optimization\n",
"**TODO optimize the model based on the test results**"
]
},
{
"cell_type": "markdown",
"id": "79b20645",
"metadata": {},
"source": [
"# Validation\n",
"**TODO Predict actual values**"
]
},
{
"cell_type": "markdown",
"id": "3b709fb7",
"metadata": {},
"source": [
"# Conclusion and outlook\n",
"**TODO Write a conclusion and outlook what can be done and where the issues were.**"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

133
test_script.py Normal file
View File

@@ -0,0 +1,133 @@
#### INITIALIZE
# Baseline experiment: predict the genres of Steam games from their description texts.
# Steps: load the CSV -> merge the description columns into one text column ->
# multi-hot encode the genre lists -> TF-IDF vectorize the text ->
# fit one logistic regression per genre and print a classification report.
import ast

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer

set_config(transform_output="pandas")  # dataframe supremacy


def merge_descriptions(X):
    """Join all columns of *X* row-wise into a single ``desc`` text column.

    Missing cells are replaced by empty strings first so that ``' '.join``
    only ever sees strings.

    Args:
        X: DataFrame holding the text columns to merge.

    Returns:
        A one-column DataFrame named ``desc`` with the same index as *X*.
    """
    return X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")


# load data
# Columns of the CSV:
# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
dataset = pd.read_csv("./games_march2025_cleaned_2k.csv", sep=",")
print(dataset.head())

#### DROP UNIQUES
print("DROP")
# No explicit drop of identifier-like columns (appid, name, urls, ...) is needed:
# the ColumnTransformer below only keeps the columns it is given and drops the rest.

#### STRUCTURIZE AND STANDARDIZE
print("STRUCTURE")
# Resulting columns: desc, genres
column_transformer = ColumnTransformer(
    [
        # merge all descriptions into one text column
        ('desc', FunctionTransformer(merge_descriptions),
         ['detailed_description', 'about_the_game', 'short_description']),
        # genres -> actual genre, but very coarse
        # tags   -> user defined tags; title num list
        # TODO: decide whether we drop tags (add 'tags' to the list below to keep them)
        ('pass', 'passthrough', ['genres']),
    ],
    verbose_feature_names_out=False,
)
dataset = column_transformer.fit_transform(dataset)
print(dataset)

#### SET MISSING VALUES
print("SETMISS")
# Only the text columns 'desc' and 'genres' are left, so there is nothing numeric to impute.
# 'desc' is already NaN-free (merge_descriptions fills missing cells); it is listed here
# purely as a safety net. A missing 'genres' cell becomes the literal of an empty list so
# that ast.literal_eval below can parse it; such rows carry no label and are removed by
# the mask before training.
dataset = dataset.fillna({'desc': '', 'genres': '[]'})

##### STRUCTURIZE GENRES to onehot
# parse the stringified python lists (e.g. "['Action', 'Indie']") into real lists
dataset['genres'] = dataset['genres'].map(ast.literal_eval)
print(dataset['genres'])  # python lists, but not yet one-hot encoded
# MultiLabelBinarizer produces one 0/1 column per distinct genre
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
genres_count = len(mlb_genres.classes_)  # for multi-label classification later
# reuse the dataset index so features and targets stay aligned row by row
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_, index=dataset.index)
print(genres_df)

#### convert text to bag of words
## Count vs Tfidf vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset['desc'])  # sparse matrix
# NOTE: toarray() materializes the full (rows x vocabulary) matrix in memory;
# acceptable for the 2k-row sample, revisit before using the full dataset.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=dataset.index,
)
print(tfidf_df)

##### MODEL
print("MODEL")
X = tfidf_df
y = genres_df

# remove datapoints that don't have a target value (all target columns are 0);
# 31 such cases were observed in the 2k sample
mask = y.sum(axis=1) > 0
X_clean = X[mask]
y_clean = y[mask]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)

# we want multiple possible outputs per sample (multi-label classification)
# -> MultiOutputClassifier fits one copy of the base estimator per genre column
# logistic regression is our base system
# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)
multi_target_clf = MultiOutputClassifier(
    LogisticRegression(max_iter=1337, random_state=0),
    n_jobs=1,
)

# model training
multi_target_clf.fit(X_train, y_train)

# predict against test data
y_pred = multi_target_clf.predict(X_test)

# print precision, recall, f1 etc.; zero_division=0.0 reports 0 instead of warning
# for genres that never get predicted / have no support
print(classification_report(y_test, y_pred, zero_division=0.0))