diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4417fc3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# Machine Learning Project – Summer Semester 2025
+
+This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program.
+
+## Objective
+
+We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions.  
+Our data source is a publicly available Steam Games dataset that we found on Kaggle.
+
+## Dataset
+
+We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data).
+
+## Contributors
+
+- Maximilian Kany
+- Florian Speicher
+- Tim Wall
\ No newline at end of file
diff --git a/comparison.py b/comparison.py
new file mode 100644
index 0000000..fcced39
--- /dev/null
+++ b/comparison.py
@@ -0,0 +1,140 @@
+import os
+import numpy as np
+import pandas as pd
+from sklearn import set_config
+
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import FunctionTransformer
+
+from sklearn.preprocessing import MultiLabelBinarizer
+import ast
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.metrics import classification_report
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_iris
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.svm import SVC, LinearSVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neural_network import MLPClassifier
+
+
+set_config(transform_output="pandas") # ask scikit-learn transformers to output pandas DataFrames
+
+def prepDataset(dataset): #returns X_train, X_test, y_train, y_test
+    """Load a Steam games CSV and build a TF-IDF / multi-hot genre train/test split.
+
+    Args:
+        dataset: Path of the CSV file to read. It must contain the columns
+            'detailed_description', 'about_the_game', 'short_description'
+            and 'genres' (a string holding a Python list literal).
+
+    Returns:
+        The result of ``train_test_split(X, y, random_state=0)``, i.e.
+        ``X_train, X_test, y_train, y_test``, where X holds one TF-IDF
+        column per vocabulary term and y one 0/1 column per genre.
+    """
+    # Read the file the caller asked for. The path used to be hard-coded to
+    # the 2k sample, so every entry in `datasets` produced identical reports.
+    dataset = pd.read_csv(dataset, sep=",")
+    # desc, genres, tags
+    column_transformer = ColumnTransformer([
+            # merge all descriptions
+            ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
+                ['detailed_description', 'about_the_game', 'short_description']),
+            ('pass', 'passthrough', ['genres']),#, 'tags'
+        ],
+        verbose_feature_names_out=False
+    )
+    dataset = column_transformer.fit_transform(dataset)
+
+    #### HANDLE MISSING VALUES
+    print("SETMISS")
+    # 'desc' cannot be missing (NaN was replaced by '' before joining), so
+    # only 'genres' can be NaN here. Drop those rows: they have no target,
+    # and filling them with '' would make ast.literal_eval raise below.
+    dataset = dataset.dropna(subset=['genres'])
+
+    ##### STRUCTURIZE GENRES to onehot
+    # parse the stringified list into a real Python list
+    genre_lists = dataset['genres'].map(ast.literal_eval)
+
+    # MultiLabelBinarizer does onehotenc for arrays
+    mlb_genres = MultiLabelBinarizer()
+    genres_encoded = mlb_genres.fit_transform(genre_lists)
+    genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
+
+    #### convert text to bag of words
+    # NOTE(review): the vectorizer is fitted on all rows before the split, so
+    # test-set vocabulary/IDF statistics leak into the training features.
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # sparse matrix
+    # NOTE(review): densifying may need a lot of memory on the larger CSVs
+    # — TODO confirm it fits, or keep the matrix sparse.
+    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
+
+    ##### MODEL
+    print("MODEL")
+    X = tfidf_df
+    y = genres_df
+    # cleanup datapoints that dont have a target value (all target columns are 0)
+    mask = y.sum(axis=1) > 0
+    X_clean = X[mask]
+    y_clean = y[mask]
+
+    # Split dataset
+    return train_test_split(X_clean, y_clean, random_state=0)
+
+def comparison(X_train, X_test, y_train, y_test, estimator, jobs: int = 1): #returns class_report
+    """Train `estimator` via MultiOutputClassifier and return the text report for the test split."""
+    # wrap the estimator so it can be fitted against the multi-column target
+    per_label_model = MultiOutputClassifier(estimator, n_jobs=jobs)
+    per_label_model.fit(X_train, y_train)
+
+    # score the held-out rows; undefined metrics are reported as 0.0
+    predictions = per_label_model.predict(X_test)
+    return classification_report(y_test, predictions, zero_division=0.0)
+
+# CSV files to evaluate; each one gets an output folder named after the file.
+datasets = [
+    'games_march2025_cleaned_2k.csv',
+    'games_march2025_cleaned_10k.csv',
+    'games_march2025_cleaned.csv'
+]
+
+# Report file name (without ".txt") -> unfitted estimator to evaluate.
+estimators = {
+    "LogisticRegression-i1000": LogisticRegression(max_iter=1000, random_state=0),
+    "LogisticRegression-i10000": LogisticRegression(max_iter=10000, random_state=0),
+    "LinearSVC-i5000": LinearSVC(max_iter=5000),
+    "SVC-RBF-i10000": SVC(kernel="rbf", max_iter=10000),
+    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
+    "RandomForestClassifier": RandomForestClassifier(random_state=0),
+    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
+    "GaussianNB": GaussianNB(),
+    "MultinomialNB": MultinomialNB(),
+    "BernoulliNB": BernoulliNB(),
+    "MLPClassifier-i10000": MLPClassifier(max_iter=10000, random_state=0),
+}
+
+for dataset in datasets:
+    print("-" * 60)
+    print("dataset -> " + dataset)
+    print("-" * 60)
+    print("mkdir")
+    folder = dataset.split(".csv")[0]
+    os.makedirs(folder, exist_ok=True)  # no error when the folder already exists
+    X_train, X_test, y_train, y_test = prepDataset(dataset)
+    for esti, estimator in estimators.items():
+        compari = comparison(X_train, X_test, y_train, y_test, estimator, 1) #TODO: change the job count if you can
+        print("open")
+        with open(os.path.join(folder, esti + ".txt"), mode="w+", encoding="utf-8") as f:  # closed even if the write fails
+            f.write(compari)
+            print("write")
+        print("close")
\ No newline at end of file
diff --git a/games_march2025_cleaned/BernoulliNB.txt b/games_march2025_cleaned/BernoulliNB.txt
new file mode 100644
index 0000000..f2237d4
--- /dev/null
+++ b/games_march2025_cleaned/BernoulliNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.75      0.90      0.82       300
+           1       0.72      0.68      0.70       216
+           2       0.50      0.08      0.14        86
+           3       0.27      0.07      0.11        46
+           4       0.40      0.07      0.12        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.82      0.79       245
+           7       0.33      0.10      0.15        42
+           8       0.67      0.40      0.50       127
+           9       0.00      0.00      0.00        12
+          10       0.71      0.37      0.49       127
+          11       0.00      0.00      0.00        14
+          12       0.49      0.31      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.55      0.62      1404
+   macro avg       0.40      0.27      0.30      1404
+weighted avg       0.64      0.55      0.56      1404
+ samples avg       0.73      0.59      0.61      1404
diff --git a/games_march2025_cleaned/DecisionTreeClassifier.txt b/games_march2025_cleaned/DecisionTreeClassifier.txt
new file mode 100644
index 0000000..900c256
--- /dev/null
+++ b/games_march2025_cleaned/DecisionTreeClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.73      0.75       300
+           1       0.56      0.53      0.54       216
+           2       0.36      0.33      0.34        86
+           3       0.33      0.26      0.29        46
+           4       0.40      0.46      0.43        83
+           5       0.00      0.00      0.00         0
+           6       0.65      0.61      0.63       245
+           7       0.39      0.40      0.40        42
+           8       0.59      0.57      0.58       127
+           9       0.60      0.25      0.35        12
+          10       0.56      0.51      0.53       127
+          11       0.39      0.50      0.44        14
+          12       0.52      0.49      0.50       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.58      0.55      0.57      1404
+   macro avg       0.44      0.40      0.41      1404
+weighted avg       0.58      0.55      0.57      1404
+ samples avg       0.59      0.59      0.55      1404
diff --git a/games_march2025_cleaned/GaussianNB.txt b/games_march2025_cleaned/GaussianNB.txt
new file mode 100644
index 0000000..83d7a2e
--- /dev/null
+++ b/games_march2025_cleaned/GaussianNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.80      0.78       300
+           1       0.62      0.51      0.56       216
+           2       0.63      0.14      0.23        86
+           3       0.17      0.02      0.04        46
+           4       0.42      0.10      0.16        83
+           5       0.00      0.00      0.00         0
+           6       0.68      0.66      0.67       245
+           7       0.56      0.12      0.20        42
+           8       0.55      0.33      0.41       127
+           9       0.67      0.17      0.27        12
+          10       0.65      0.31      0.42       127
+          11       1.00      0.14      0.25        14
+          12       0.53      0.29      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.66      0.47      0.55      1404
+   macro avg       0.52      0.26      0.31      1404
+weighted avg       0.62      0.47      0.51      1404
+ samples avg       0.67      0.53      0.55      1404
diff --git a/games_march2025_cleaned/GradientBoostingClassifier.txt b/games_march2025_cleaned/GradientBoostingClassifier.txt
new file mode 100644
index 0000000..7c8ce6e
--- /dev/null
+++ b/games_march2025_cleaned/GradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.80      0.83       300
+           1       0.77      0.61      0.68       216
+           2       0.55      0.13      0.21        86
+           3       0.42      0.11      0.17        46
+           4       0.68      0.33      0.44        83
+           5       0.00      0.00      0.00         0
+           6       0.71      0.76      0.74       245
+           7       0.61      0.26      0.37        42
+           8       0.81      0.50      0.61       127
+           9       0.75      0.25      0.38        12
+          10       0.81      0.54      0.65       127
+          11       0.40      0.43      0.41        14
+          12       0.69      0.42      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.76      0.57      0.65      1404
+   macro avg       0.57      0.37      0.43      1404
+weighted avg       0.74      0.57      0.63      1404
+ samples avg       0.76      0.63      0.65      1404
diff --git a/games_march2025_cleaned/LinearSVC-i5000.txt b/games_march2025_cleaned/LinearSVC-i5000.txt
new file mode 100644
index 0000000..df82b40
--- /dev/null
+++ b/games_march2025_cleaned/LinearSVC-i5000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.87      0.86       300
+           1       0.76      0.66      0.70       216
+           2       0.77      0.20      0.31        86
+           3       0.00      0.00      0.00        46
+           4       0.76      0.27      0.39        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.81      0.79       245
+           7       0.89      0.19      0.31        42
+           8       0.77      0.60      0.67       127
+           9       1.00      0.58      0.74        12
+          10       0.85      0.54      0.66       127
+          11       1.00      0.29      0.44        14
+          12       0.82      0.42      0.56       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.61      0.69      1404
+   macro avg       0.66      0.39      0.46      1404
+weighted avg       0.78      0.61      0.66      1404
+ samples avg       0.81      0.67      0.69      1404
diff --git a/games_march2025_cleaned/LogisticRegression-i1000.txt b/games_march2025_cleaned/LogisticRegression-i1000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned/LogisticRegression-i1000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned/LogisticRegression-i10000.txt b/games_march2025_cleaned/LogisticRegression-i10000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned/LogisticRegression-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned/MLPClassifier-i10000.txt b/games_march2025_cleaned/MLPClassifier-i10000.txt
new file mode 100644
index 0000000..c4634dc
--- /dev/null
+++ b/games_march2025_cleaned/MLPClassifier-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.85      0.84       300
+           1       0.73      0.67      0.70       216
+           2       0.74      0.30      0.43        86
+           3       0.50      0.02      0.04        46
+           4       0.69      0.24      0.36        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.79      0.79       245
+           7       0.86      0.14      0.24        42
+           8       0.76      0.63      0.69       127
+           9       1.00      0.33      0.50        12
+          10       0.81      0.52      0.63       127
+          11       1.00      0.14      0.25        14
+          12       0.75      0.41      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.60      0.68      1404
+   macro avg       0.68      0.36      0.43      1404
+weighted avg       0.78      0.60      0.65      1404
+ samples avg       0.80      0.66      0.68      1404
diff --git a/games_march2025_cleaned/MultinomialNB.txt b/games_march2025_cleaned/MultinomialNB.txt
new file mode 100644
index 0000000..bc74cf3
--- /dev/null
+++ b/games_march2025_cleaned/MultinomialNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.64      0.99      0.78       300
+           1       0.85      0.24      0.37       216
+           2       0.60      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       0.80      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.80      0.79       245
+           7       0.40      0.05      0.09        42
+           8       1.00      0.04      0.08       127
+           9       0.00      0.00      0.00        12
+          10       0.20      0.01      0.02       127
+          11       0.00      0.00      0.00        14
+          12       1.00      0.05      0.09       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.69      0.40      0.51      1404
+   macro avg       0.45      0.16      0.17      1404
+weighted avg       0.68      0.40      0.39      1404
+ samples avg       0.70      0.44      0.50      1404
diff --git a/games_march2025_cleaned/RandomForestClassifier.txt b/games_march2025_cleaned/RandomForestClassifier.txt
new file mode 100644
index 0000000..6fbe546
--- /dev/null
+++ b/games_march2025_cleaned/RandomForestClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.80      0.88      0.84       300
+           1       0.78      0.55      0.64       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.06      0.11        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.78      0.76       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.24      0.38       127
+           9       0.00      0.00      0.00        12
+          10       0.91      0.24      0.38       127
+          11       1.00      0.14      0.25        14
+          12       1.00      0.25      0.39       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.48      0.59      1404
+   macro avg       0.58      0.23      0.27      1404
+weighted avg       0.78      0.48      0.52      1404
+ samples avg       0.77      0.54      0.60      1404
diff --git a/games_march2025_cleaned/SVC-RBF-i10000.txt b/games_march2025_cleaned/SVC-RBF-i10000.txt
new file mode 100644
index 0000000..ff0c7b7
--- /dev/null
+++ b/games_march2025_cleaned/SVC-RBF-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.81      0.90      0.85       300
+           1       0.76      0.63      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.83      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.40      0.54       127
+           9       1.00      0.17      0.29        12
+          10       0.90      0.34      0.49       127
+          11       1.00      0.14      0.25        14
+          12       0.92      0.21      0.34       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.53      0.63      1404
+   macro avg       0.64      0.26      0.32      1404
+weighted avg       0.79      0.53      0.56      1404
+ samples avg       0.79      0.59      0.63      1404
diff --git a/games_march2025_cleaned_10k.csv b/games_march2025_cleaned_10k.csv
new file mode 100644
index 0000000..2c3c073
--- /dev/null
+++ b/games_march2025_cleaned_10k.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12cf598a6e41d83cfa9c16e99d4d9578cb4ee7c3594fae9f9b921772887a08d7
+size 68658136
diff --git a/games_march2025_cleaned_10k/BernoulliNB.txt b/games_march2025_cleaned_10k/BernoulliNB.txt
new file mode 100644
index 0000000..f2237d4
--- /dev/null
+++ b/games_march2025_cleaned_10k/BernoulliNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.75      0.90      0.82       300
+           1       0.72      0.68      0.70       216
+           2       0.50      0.08      0.14        86
+           3       0.27      0.07      0.11        46
+           4       0.40      0.07      0.12        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.82      0.79       245
+           7       0.33      0.10      0.15        42
+           8       0.67      0.40      0.50       127
+           9       0.00      0.00      0.00        12
+          10       0.71      0.37      0.49       127
+          11       0.00      0.00      0.00        14
+          12       0.49      0.31      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.55      0.62      1404
+   macro avg       0.40      0.27      0.30      1404
+weighted avg       0.64      0.55      0.56      1404
+ samples avg       0.73      0.59      0.61      1404
diff --git a/games_march2025_cleaned_10k/DecisionTreeClassifier.txt b/games_march2025_cleaned_10k/DecisionTreeClassifier.txt
new file mode 100644
index 0000000..900c256
--- /dev/null
+++ b/games_march2025_cleaned_10k/DecisionTreeClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.73      0.75       300
+           1       0.56      0.53      0.54       216
+           2       0.36      0.33      0.34        86
+           3       0.33      0.26      0.29        46
+           4       0.40      0.46      0.43        83
+           5       0.00      0.00      0.00         0
+           6       0.65      0.61      0.63       245
+           7       0.39      0.40      0.40        42
+           8       0.59      0.57      0.58       127
+           9       0.60      0.25      0.35        12
+          10       0.56      0.51      0.53       127
+          11       0.39      0.50      0.44        14
+          12       0.52      0.49      0.50       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.58      0.55      0.57      1404
+   macro avg       0.44      0.40      0.41      1404
+weighted avg       0.58      0.55      0.57      1404
+ samples avg       0.59      0.59      0.55      1404
diff --git a/games_march2025_cleaned_10k/GaussianNB.txt b/games_march2025_cleaned_10k/GaussianNB.txt
new file mode 100644
index 0000000..83d7a2e
--- /dev/null
+++ b/games_march2025_cleaned_10k/GaussianNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.80      0.78       300
+           1       0.62      0.51      0.56       216
+           2       0.63      0.14      0.23        86
+           3       0.17      0.02      0.04        46
+           4       0.42      0.10      0.16        83
+           5       0.00      0.00      0.00         0
+           6       0.68      0.66      0.67       245
+           7       0.56      0.12      0.20        42
+           8       0.55      0.33      0.41       127
+           9       0.67      0.17      0.27        12
+          10       0.65      0.31      0.42       127
+          11       1.00      0.14      0.25        14
+          12       0.53      0.29      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.66      0.47      0.55      1404
+   macro avg       0.52      0.26      0.31      1404
+weighted avg       0.62      0.47      0.51      1404
+ samples avg       0.67      0.53      0.55      1404
diff --git a/games_march2025_cleaned_10k/GradientBoostingClassifier.txt b/games_march2025_cleaned_10k/GradientBoostingClassifier.txt
new file mode 100644
index 0000000..7c8ce6e
--- /dev/null
+++ b/games_march2025_cleaned_10k/GradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.80      0.83       300
+           1       0.77      0.61      0.68       216
+           2       0.55      0.13      0.21        86
+           3       0.42      0.11      0.17        46
+           4       0.68      0.33      0.44        83
+           5       0.00      0.00      0.00         0
+           6       0.71      0.76      0.74       245
+           7       0.61      0.26      0.37        42
+           8       0.81      0.50      0.61       127
+           9       0.75      0.25      0.38        12
+          10       0.81      0.54      0.65       127
+          11       0.40      0.43      0.41        14
+          12       0.69      0.42      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.76      0.57      0.65      1404
+   macro avg       0.57      0.37      0.43      1404
+weighted avg       0.74      0.57      0.63      1404
+ samples avg       0.76      0.63      0.65      1404
diff --git a/games_march2025_cleaned_10k/LinearSVC-i5000.txt b/games_march2025_cleaned_10k/LinearSVC-i5000.txt
new file mode 100644
index 0000000..df82b40
--- /dev/null
+++ b/games_march2025_cleaned_10k/LinearSVC-i5000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.87      0.86       300
+           1       0.76      0.66      0.70       216
+           2       0.77      0.20      0.31        86
+           3       0.00      0.00      0.00        46
+           4       0.76      0.27      0.39        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.81      0.79       245
+           7       0.89      0.19      0.31        42
+           8       0.77      0.60      0.67       127
+           9       1.00      0.58      0.74        12
+          10       0.85      0.54      0.66       127
+          11       1.00      0.29      0.44        14
+          12       0.82      0.42      0.56       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.61      0.69      1404
+   macro avg       0.66      0.39      0.46      1404
+weighted avg       0.78      0.61      0.66      1404
+ samples avg       0.81      0.67      0.69      1404
diff --git a/games_march2025_cleaned_10k/LogisticRegression-i1000.txt b/games_march2025_cleaned_10k/LogisticRegression-i1000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned_10k/LogisticRegression-i1000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned_10k/LogisticRegression-i10000.txt b/games_march2025_cleaned_10k/LogisticRegression-i10000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned_10k/LogisticRegression-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned_10k/MLPClassifier-i10000.txt b/games_march2025_cleaned_10k/MLPClassifier-i10000.txt
new file mode 100644
index 0000000..c4634dc
--- /dev/null
+++ b/games_march2025_cleaned_10k/MLPClassifier-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.85      0.84       300
+           1       0.73      0.67      0.70       216
+           2       0.74      0.30      0.43        86
+           3       0.50      0.02      0.04        46
+           4       0.69      0.24      0.36        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.79      0.79       245
+           7       0.86      0.14      0.24        42
+           8       0.76      0.63      0.69       127
+           9       1.00      0.33      0.50        12
+          10       0.81      0.52      0.63       127
+          11       1.00      0.14      0.25        14
+          12       0.75      0.41      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.60      0.68      1404
+   macro avg       0.68      0.36      0.43      1404
+weighted avg       0.78      0.60      0.65      1404
+ samples avg       0.80      0.66      0.68      1404
diff --git a/games_march2025_cleaned_10k/MultinomialNB.txt b/games_march2025_cleaned_10k/MultinomialNB.txt
new file mode 100644
index 0000000..bc74cf3
--- /dev/null
+++ b/games_march2025_cleaned_10k/MultinomialNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.64      0.99      0.78       300
+           1       0.85      0.24      0.37       216
+           2       0.60      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       0.80      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.80      0.79       245
+           7       0.40      0.05      0.09        42
+           8       1.00      0.04      0.08       127
+           9       0.00      0.00      0.00        12
+          10       0.20      0.01      0.02       127
+          11       0.00      0.00      0.00        14
+          12       1.00      0.05      0.09       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.69      0.40      0.51      1404
+   macro avg       0.45      0.16      0.17      1404
+weighted avg       0.68      0.40      0.39      1404
+ samples avg       0.70      0.44      0.50      1404
diff --git a/games_march2025_cleaned_10k/RandomForestClassifier.txt b/games_march2025_cleaned_10k/RandomForestClassifier.txt
new file mode 100644
index 0000000..6fbe546
--- /dev/null
+++ b/games_march2025_cleaned_10k/RandomForestClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.80      0.88      0.84       300
+           1       0.78      0.55      0.64       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.06      0.11        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.78      0.76       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.24      0.38       127
+           9       0.00      0.00      0.00        12
+          10       0.91      0.24      0.38       127
+          11       1.00      0.14      0.25        14
+          12       1.00      0.25      0.39       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.48      0.59      1404
+   macro avg       0.58      0.23      0.27      1404
+weighted avg       0.78      0.48      0.52      1404
+ samples avg       0.77      0.54      0.60      1404
diff --git a/games_march2025_cleaned_10k/SVC-RBF-i10000.txt b/games_march2025_cleaned_10k/SVC-RBF-i10000.txt
new file mode 100644
index 0000000..ff0c7b7
--- /dev/null
+++ b/games_march2025_cleaned_10k/SVC-RBF-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.81      0.90      0.85       300
+           1       0.76      0.63      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.83      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.40      0.54       127
+           9       1.00      0.17      0.29        12
+          10       0.90      0.34      0.49       127
+          11       1.00      0.14      0.25        14
+          12       0.92      0.21      0.34       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.53      0.63      1404
+   macro avg       0.64      0.26      0.32      1404
+weighted avg       0.79      0.53      0.56      1404
+ samples avg       0.79      0.59      0.63      1404
diff --git a/games_march2025_cleaned_2k.csv b/games_march2025_cleaned_2k.csv
new file mode 100644
index 0000000..806e982
--- /dev/null
+++ b/games_march2025_cleaned_2k.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ba38404995149bcb8e5a321459f73b4adf58597f85bab396dd054cc78c145d
+size 15455174
diff --git a/games_march2025_cleaned_2k/BernoulliNB.txt b/games_march2025_cleaned_2k/BernoulliNB.txt
new file mode 100644
index 0000000..f2237d4
--- /dev/null
+++ b/games_march2025_cleaned_2k/BernoulliNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.75      0.90      0.82       300
+           1       0.72      0.68      0.70       216
+           2       0.50      0.08      0.14        86
+           3       0.27      0.07      0.11        46
+           4       0.40      0.07      0.12        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.82      0.79       245
+           7       0.33      0.10      0.15        42
+           8       0.67      0.40      0.50       127
+           9       0.00      0.00      0.00        12
+          10       0.71      0.37      0.49       127
+          11       0.00      0.00      0.00        14
+          12       0.49      0.31      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.70      0.55      0.62      1404
+   macro avg       0.40      0.27      0.30      1404
+weighted avg       0.64      0.55      0.56      1404
+ samples avg       0.73      0.59      0.61      1404
diff --git a/games_march2025_cleaned_2k/DecisionTreeClassifier.txt b/games_march2025_cleaned_2k/DecisionTreeClassifier.txt
new file mode 100644
index 0000000..900c256
--- /dev/null
+++ b/games_march2025_cleaned_2k/DecisionTreeClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.73      0.75       300
+           1       0.56      0.53      0.54       216
+           2       0.36      0.33      0.34        86
+           3       0.33      0.26      0.29        46
+           4       0.40      0.46      0.43        83
+           5       0.00      0.00      0.00         0
+           6       0.65      0.61      0.63       245
+           7       0.39      0.40      0.40        42
+           8       0.59      0.57      0.58       127
+           9       0.60      0.25      0.35        12
+          10       0.56      0.51      0.53       127
+          11       0.39      0.50      0.44        14
+          12       0.52      0.49      0.50       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.58      0.55      0.57      1404
+   macro avg       0.44      0.40      0.41      1404
+weighted avg       0.58      0.55      0.57      1404
+ samples avg       0.59      0.59      0.55      1404
diff --git a/games_march2025_cleaned_2k/GaussianNB.txt b/games_march2025_cleaned_2k/GaussianNB.txt
new file mode 100644
index 0000000..83d7a2e
--- /dev/null
+++ b/games_march2025_cleaned_2k/GaussianNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.76      0.80      0.78       300
+           1       0.62      0.51      0.56       216
+           2       0.63      0.14      0.23        86
+           3       0.17      0.02      0.04        46
+           4       0.42      0.10      0.16        83
+           5       0.00      0.00      0.00         0
+           6       0.68      0.66      0.67       245
+           7       0.56      0.12      0.20        42
+           8       0.55      0.33      0.41       127
+           9       0.67      0.17      0.27        12
+          10       0.65      0.31      0.42       127
+          11       1.00      0.14      0.25        14
+          12       0.53      0.29      0.38       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.66      0.47      0.55      1404
+   macro avg       0.52      0.26      0.31      1404
+weighted avg       0.62      0.47      0.51      1404
+ samples avg       0.67      0.53      0.55      1404
diff --git a/games_march2025_cleaned_2k/GradientBoostingClassifier.txt b/games_march2025_cleaned_2k/GradientBoostingClassifier.txt
new file mode 100644
index 0000000..7c8ce6e
--- /dev/null
+++ b/games_march2025_cleaned_2k/GradientBoostingClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.80      0.83       300
+           1       0.77      0.61      0.68       216
+           2       0.55      0.13      0.21        86
+           3       0.42      0.11      0.17        46
+           4       0.68      0.33      0.44        83
+           5       0.00      0.00      0.00         0
+           6       0.71      0.76      0.74       245
+           7       0.61      0.26      0.37        42
+           8       0.81      0.50      0.61       127
+           9       0.75      0.25      0.38        12
+          10       0.81      0.54      0.65       127
+          11       0.40      0.43      0.41        14
+          12       0.69      0.42      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.76      0.57      0.65      1404
+   macro avg       0.57      0.37      0.43      1404
+weighted avg       0.74      0.57      0.63      1404
+ samples avg       0.76      0.63      0.65      1404
diff --git a/games_march2025_cleaned_2k/LinearSVC-i5000.txt b/games_march2025_cleaned_2k/LinearSVC-i5000.txt
new file mode 100644
index 0000000..df82b40
--- /dev/null
+++ b/games_march2025_cleaned_2k/LinearSVC-i5000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.85      0.87      0.86       300
+           1       0.76      0.66      0.70       216
+           2       0.77      0.20      0.31        86
+           3       0.00      0.00      0.00        46
+           4       0.76      0.27      0.39        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.81      0.79       245
+           7       0.89      0.19      0.31        42
+           8       0.77      0.60      0.67       127
+           9       1.00      0.58      0.74        12
+          10       0.85      0.54      0.66       127
+          11       1.00      0.29      0.44        14
+          12       0.82      0.42      0.56       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.61      0.69      1404
+   macro avg       0.66      0.39      0.46      1404
+weighted avg       0.78      0.61      0.66      1404
+ samples avg       0.81      0.67      0.69      1404
diff --git a/games_march2025_cleaned_2k/LogisticRegression-i1000.txt b/games_march2025_cleaned_2k/LogisticRegression-i1000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned_2k/LogisticRegression-i1000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned_2k/LogisticRegression-i10000.txt b/games_march2025_cleaned_2k/LogisticRegression-i10000.txt
new file mode 100644
index 0000000..b7926d4
--- /dev/null
+++ b/games_march2025_cleaned_2k/LogisticRegression-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.78      0.91      0.84       300
+           1       0.78      0.62      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.04      0.07        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.81      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.90      0.34      0.49       127
+           9       0.00      0.00      0.00        12
+          10       0.89      0.25      0.39       127
+          11       0.00      0.00      0.00        14
+          12       0.88      0.14      0.24       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.50      0.61      1404
+   macro avg       0.50      0.22      0.26      1404
+weighted avg       0.77      0.50      0.53      1404
+ samples avg       0.77      0.56      0.60      1404
diff --git a/games_march2025_cleaned_2k/MLPClassifier-i10000.txt b/games_march2025_cleaned_2k/MLPClassifier-i10000.txt
new file mode 100644
index 0000000..c4634dc
--- /dev/null
+++ b/games_march2025_cleaned_2k/MLPClassifier-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.84      0.85      0.84       300
+           1       0.73      0.67      0.70       216
+           2       0.74      0.30      0.43        86
+           3       0.50      0.02      0.04        46
+           4       0.69      0.24      0.36        83
+           5       0.00      0.00      0.00         0
+           6       0.79      0.79      0.79       245
+           7       0.86      0.14      0.24        42
+           8       0.76      0.63      0.69       127
+           9       1.00      0.33      0.50        12
+          10       0.81      0.52      0.63       127
+          11       1.00      0.14      0.25        14
+          12       0.75      0.41      0.53       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.60      0.68      1404
+   macro avg       0.68      0.36      0.43      1404
+weighted avg       0.78      0.60      0.65      1404
+ samples avg       0.80      0.66      0.68      1404
diff --git a/games_march2025_cleaned_2k/MultinomialNB.txt b/games_march2025_cleaned_2k/MultinomialNB.txt
new file mode 100644
index 0000000..bc74cf3
--- /dev/null
+++ b/games_march2025_cleaned_2k/MultinomialNB.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.64      0.99      0.78       300
+           1       0.85      0.24      0.37       216
+           2       0.60      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       0.80      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.78      0.80      0.79       245
+           7       0.40      0.05      0.09        42
+           8       1.00      0.04      0.08       127
+           9       0.00      0.00      0.00        12
+          10       0.20      0.01      0.02       127
+          11       0.00      0.00      0.00        14
+          12       1.00      0.05      0.09       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.69      0.40      0.51      1404
+   macro avg       0.45      0.16      0.17      1404
+weighted avg       0.68      0.40      0.39      1404
+ samples avg       0.70      0.44      0.50      1404
diff --git a/games_march2025_cleaned_2k/RandomForestClassifier.txt b/games_march2025_cleaned_2k/RandomForestClassifier.txt
new file mode 100644
index 0000000..6fbe546
--- /dev/null
+++ b/games_march2025_cleaned_2k/RandomForestClassifier.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.80      0.88      0.84       300
+           1       0.78      0.55      0.64       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.06      0.11        83
+           5       0.00      0.00      0.00         0
+           6       0.74      0.78      0.76       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.24      0.38       127
+           9       0.00      0.00      0.00        12
+          10       0.91      0.24      0.38       127
+          11       1.00      0.14      0.25        14
+          12       1.00      0.25      0.39       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.79      0.48      0.59      1404
+   macro avg       0.58      0.23      0.27      1404
+weighted avg       0.78      0.48      0.52      1404
+ samples avg       0.77      0.54      0.60      1404
diff --git a/games_march2025_cleaned_2k/SVC-RBF-i10000.txt b/games_march2025_cleaned_2k/SVC-RBF-i10000.txt
new file mode 100644
index 0000000..ff0c7b7
--- /dev/null
+++ b/games_march2025_cleaned_2k/SVC-RBF-i10000.txt
@@ -0,0 +1,21 @@
+              precision    recall  f1-score   support
+
+           0       0.81      0.90      0.85       300
+           1       0.76      0.63      0.69       216
+           2       1.00      0.03      0.07        86
+           3       0.00      0.00      0.00        46
+           4       1.00      0.05      0.09        83
+           5       0.00      0.00      0.00         0
+           6       0.77      0.83      0.80       245
+           7       0.00      0.00      0.00        42
+           8       0.84      0.40      0.54       127
+           9       1.00      0.17      0.29        12
+          10       0.90      0.34      0.49       127
+          11       1.00      0.14      0.25        14
+          12       0.92      0.21      0.34       106
+          13       0.00      0.00      0.00         0
+
+   micro avg       0.80      0.53      0.63      1404
+   macro avg       0.64      0.26      0.32      1404
+weighted avg       0.79      0.53      0.56      1404
+ samples avg       0.79      0.59      0.63      1404
diff --git a/notebook.ipynb b/notebook.ipynb
new file mode 100644
index 0000000..3307ceb
--- /dev/null
+++ b/notebook.ipynb
@@ -0,0 +1,530 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a3a7634f",
+   "metadata": {},
+   "source": [
+    "# Machine Learning project in SoSe 2025 at HTW Saar\n",
+    "## Idea\n",
+    "The goal of this project is to predict the genre(s) of a game/bundle from its description(s).\n",
+    "\n",
+    "## Dataset\n",
+    "For our project we use a Steam Dataset provided on moodle, since it has all information we plan on using.\n",
+    "The Dataset has been cut to only 2000 data points to be runnable on weaker devices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3116b75f",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   appid              name release_date  required_age  price  dlc_count  \\\n",
+      "0    730  Counter-Strike 2   2012-08-21             0    0.0          1   \n",
+      "\n",
+      "                                detailed_description  \\\n",
+      "0  For over two decades, Counter-Strike has offer...   \n",
+      "\n",
+      "                                      about_the_game  \\\n",
+      "0  For over two decades, Counter-Strike has offer...   \n",
+      "\n",
+      "                                   short_description reviews  ...  \\\n",
+      "0  For over two decades, Counter-Strike has offer...     NaN  ...   \n",
+      "\n",
+      "  average_playtime_2weeks median_playtime_forever median_playtime_2weeks  \\\n",
+      "0                     879                    5174                    350   \n",
+      "\n",
+      "  discount  peak_ccu                                               tags  \\\n",
+      "0        0   1212356  {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...   \n",
+      "\n",
+      "   pct_pos_total  num_reviews_total pct_pos_recent  num_reviews_recent  \n",
+      "0             86            8632939             82               96473  \n",
+      "\n",
+      "[1 rows x 47 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn import set_config\n",
+    "\n",
+    "set_config(transform_output=\"pandas\")\n",
+    "\n",
+    "dataset = pd.read_csv(\"./games_march2025_cleaned_2k.csv\",sep=\",\")\n",
+    "print(dataset.head(1))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cba9750a",
+   "metadata": {},
+   "source": [
+    "## Preparation of the Dataset\n",
+    "### Removing Uniques\n",
+    "The following features can uniquely identify a datapoint and would normally be removed from the training set. We skip that here because the next step drops them anyway:\n",
+    "- AppId\n",
+    "- Name of the Game\n",
+    "- Release Date\n",
+    "- Reviews\n",
+    "- Header Image\n",
+    "- Website\n",
+    "- Support URL\n",
+    "- Support Email\n",
+    "- MetaCritic URL\n",
+    "- Developer\n",
+    "- Publisher\n",
+    "- Screenshots\n",
+    "- Movies\n",
+    "- Estimated Owners"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d159117377f3633c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n",
+    "#print(dataset.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e1b28ddd69f1e9a6",
+   "metadata": {},
+   "source": [
+    "## Hold onto necessary information\n",
+    "Our model should turn a textual description of a game into its genre. For that we need all the textual information a game has, as well as the genres of the game.\n",
+    "We use a ColumnTransformer to drop all unnecessary columns, merge all descriptions of a game into one big description, and keep the genres.\n",
+    "\n",
+    "It is important to use ``verbose_feature_names_out=False`` so the feature names don't get changed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "986fbb31a7ae0d8b",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                desc  \\\n",
+      "0  For over two decades, Counter-Strike has offer...   \n",
+      "1  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   \n",
+      "2  The most-played game on Steam. Every day, mill...   \n",
+      "3  When a young street hustler, a retired bank ro...   \n",
+      "4  Edition Comparison Ultimate Edition The Tom Cl...   \n",
+      "\n",
+      "                                              genres  \n",
+      "0                         ['Action', 'Free To Play']  \n",
+      "1  ['Action', 'Adventure', 'Massively Multiplayer...  \n",
+      "2             ['Action', 'Strategy', 'Free To Play']  \n",
+      "3                            ['Action', 'Adventure']  \n",
+      "4                                         ['Action']  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.preprocessing import FunctionTransformer\n",
+    "\n",
+    "# desc, genres\n",
+    "column_transformer = ColumnTransformer([\n",
+    "        # merge all descriptions\n",
+    "        ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name=\"desc\")),\n",
+    "            ['detailed_description', 'about_the_game', 'short_description']),\n",
+    "        ('pass', 'passthrough', ['genres']),\n",
+    "    ],\n",
+    "    verbose_feature_names_out=False\n",
+    ")\n",
+    "dataset = column_transformer.fit_transform(dataset)\n",
+    "print(dataset.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9b89c0645811564",
+   "metadata": {},
+   "source": [
+    "### Adding missing Information\n",
+    "Some games might not have any description. For these we insert an empty string.\n",
+    "**TODO: check whether dropna and the numeric_only fillna are needed, as we don't have any numeric columns**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44239f6b7fd23cde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# missing numeric values => mean\n",
+    "dataset.fillna(dataset.mean(numeric_only=True), inplace=True)\n",
+    "# missing strings => empty string?\n",
+    "dataset.fillna('', inplace=True)\n",
+    "# drop all lines with missing values\n",
+    "dataset.dropna(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca5b59b9fa8160a0",
+   "metadata": {},
+   "source": [
+    "## Transform Genres\n",
+    "The genre information is currently a string holding a Python list of genres. While this is machine-readable, we need One-Hot-Encoding for our model to work.\n",
+    "\n",
+    "#### Parsing the String-Array\n",
+    "The \"ast\" library's ``literal_eval`` parses a string containing a Python literal into the corresponding Python object, and as such will be used to turn each genre string into a real list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebc5a24e9bc87fdd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0                               [Action, Free To Play]\n",
+      "1    [Action, Adventure, Massively Multiplayer, Fre...\n",
+      "2                     [Action, Strategy, Free To Play]\n",
+      "3                                  [Action, Adventure]\n",
+      "4                                             [Action]\n",
+      "Name: genres, dtype: object\n"
+     ]
+    }
+   ],
+   "source": [
+    "import ast\n",
+    "\n",
+    "dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))\n",
+    "print(dataset['genres'].head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f90756f9ad9211f4",
+   "metadata": {},
+   "source": [
+    "#### One-Hot-Encoding a Python List\n",
+    "The sklearn ``OneHotEncoder()`` only works with a 1D array of classes, such as ``['Politics', 'Sport', 'Culture']``: every datapoint can have only one class at a time.\n",
+    "Steam allows an app/bundle to have multiple genres. As such, our dataset has a 2D array of classes, which sklearn's ``MultiLabelBinarizer()`` supports."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2c3527a5fc876bf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   Action  Adventure  Casual  Early Access  Free To Play  Gore  Indie  \\\n",
+      "0       1          0       0             0             1     0      0   \n",
+      "1       1          1       0             0             1     0      0   \n",
+      "2       1          0       0             0             1     0      0   \n",
+      "3       1          1       0             0             0     0      0   \n",
+      "4       1          0       0             0             0     0      0   \n",
+      "\n",
+      "   Massively Multiplayer  RPG  Racing  Simulation  Sports  Strategy  Violent  \n",
+      "0                      0    0       0           0       0         0        0  \n",
+      "1                      1    0       0           0       0         0        0  \n",
+      "2                      0    0       0           0       0         1        0  \n",
+      "3                      0    0       0           0       0         0        0  \n",
+      "4                      0    0       0           0       0         0        0  \n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import MultiLabelBinarizer\n",
+    "\n",
+    "mlb_genres = MultiLabelBinarizer()\n",
+    "genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))\n",
+    "genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)\n",
+    "print(genres_df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "671c01f9f4ae66d9",
+   "metadata": {},
+   "source": [
+    "With this, our target matrix is completed."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5436c87",
+   "metadata": {},
+   "source": [
+    "### Structurizing Text\n",
+    "If we want our model to be able to use text as an input, we have to vectorize the text. TF-IDF (Term Frequency-Inverse Document Frequency) is an easy way of transforming each word into a feature with a value between 0 and 1. **TODO: filter out stopwords**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e8b407c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "    00  000  000km    000th  00am  00f  00i  00p  00v   01  ...  이터널  이터널리턴  \\\n",
+      "0  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "1  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "2  0.0  0.0    0.0  0.14649   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "3  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "4  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "\n",
+      "   이현준  정대찬  중입니다   철권  토탈워  페르소나  한국어  한글을  \n",
+      "0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
+      "1  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
+      "2  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
+      "3  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
+      "4  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
+      "\n",
+      "[5 rows x 29351 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
+    "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
+    "print(tfidf_df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad84e777",
+   "metadata": {},
+   "source": [
+    "With this our feature matrix is completed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86d9da42f4df8e49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = tfidf_df\n",
+    "y = genres_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aeb782668f311cd8",
+   "metadata": {},
+   "source": [
+    "## The Model\n",
+    "\n",
+    "#### Removing unpredictable Datapoints\n",
+    "Some datapoints don't have a genre assigned (all values in y are 0). The model we use can't handle such cases, thus they have to be removed.\n",
+    "We build a boolean mask that selects all datapoints with at least one genre and apply that mask to both matrices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4919bf1b37d171a7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "13\n"
+     ]
+    }
+   ],
+   "source": [
+    "mask = y.sum(axis=1).map(lambda x: x > 0)\n",
+    "print((mask == False).sum()) # count of unpredictable datapoints\n",
+    "\n",
+    "X_clean = X[mask]\n",
+    "y_clean = y[mask]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "091d7e13",
+   "metadata": {},
+   "source": [
+    "# Splitting up data\n",
+    "We have to split up our data into training and testing data.\n",
+    "Using random_state=0 guarantees reproducibility."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfbf3787",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12b5283d",
+   "metadata": {},
+   "source": [
+    "# Model Selection\n",
+    "**TODO Deciding which model to use for this task**\n",
+    "\n",
+    "As a game can have multiple genres, our model has to be capable of multi-label classification. sklearn's ``MultiOutputClassifier`` can do this. As the base estimator for ``MultiOutputClassifier`` we use ``LogisticRegression``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c1d72c4532bd509",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.multioutput import MultiOutputClassifier\n",
+    "\n",
+    "# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)\n",
+    "multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)\n",
+    "\n",
+    "multi_target_clf.fit(X_train, y_train)\n",
+    "\n",
+    "y_pred = multi_target_clf.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0faa9856",
+   "metadata": {},
+   "source": [
+    "# Evaluation\n",
+    "**TODO Test the Model with the test data**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2ebea6945193e07",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.78      0.91      0.84       300\n",
+      "           1       0.78      0.62      0.69       216\n",
+      "           2       1.00      0.03      0.07        86\n",
+      "           3       0.00      0.00      0.00        46\n",
+      "           4       1.00      0.04      0.07        83\n",
+      "           5       0.00      0.00      0.00         0\n",
+      "           6       0.79      0.81      0.80       245\n",
+      "           7       0.00      0.00      0.00        42\n",
+      "           8       0.90      0.34      0.49       127\n",
+      "           9       0.00      0.00      0.00        12\n",
+      "          10       0.89      0.25      0.39       127\n",
+      "          11       0.00      0.00      0.00        14\n",
+      "          12       0.88      0.14      0.24       106\n",
+      "          13       0.00      0.00      0.00         0\n",
+      "\n",
+      "   micro avg       0.79      0.50      0.61      1404\n",
+      "   macro avg       0.50      0.22      0.26      1404\n",
+      "weighted avg       0.77      0.50      0.53      1404\n",
+      " samples avg       0.77      0.56      0.60      1404\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import classification_report\n",
+    "\n",
+    "print(classification_report(y_test, y_pred, zero_division=0.0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2aeb6fc2",
+   "metadata": {},
+   "source": [
+    "# Optimization\n",
+    "**TODO optimize the model based on the test results**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79b20645",
+   "metadata": {},
+   "source": [
+    "# Validation\n",
+    "**TODO Predict actual values**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b709fb7",
+   "metadata": {},
+   "source": [
+    "# Conclusion and outlook\n",
+    "**TODO Write a conclusion and outlook what can be done and where the issues were.**"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/test_script.py b/test_script.py
new file mode 100644
index 0000000..de7e833
--- /dev/null
+++ b/test_script.py
@@ -0,0 +1,133 @@
+
+
+#### INITIALIZE
+
+import numpy as np
+import pandas as pd
+from sklearn import set_config
+set_config(transform_output="pandas") # dataframe supremacy
+
+# load data
+# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
+dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",")
+print(dataset.head())
+
+
+
+
+#### DROP UNIQUES
+print("DROP")
+
+#TODO: redundant - the column transformer below drops these columns anyway
+
+# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
+#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',
+#              'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],
+#              axis=1, inplace=True)
+#print(dataset.head())
+
+#### STRUCTURIZE AND STANDARDIZE
+print("STRUCTURE")
+
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import FunctionTransformer
+
+
+# desc, genres, tags
+column_transformer = ColumnTransformer([
+        # merge all descriptions
+        ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
+            ['detailed_description', 'about_the_game', 'short_description']),
+        # genre -> actual genre, but very coarse
+        # tags -> user defined tags; title num list
+        #TODO: decide whether we drop tags
+        ('pass', 'passthrough', ['genres']),#, 'tags'
+    ],
+    verbose_feature_names_out=False
+)
+dataset = column_transformer.fit_transform(dataset)
+print(dataset)
+
+
+
+#### SET MISSING VALUES
+print("SETMISS")
+
+
+# Setting missing numeric values to the mean
+dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
+# Setting missing text values to 'Unknown'
+dataset.fillna('', inplace=True)
+# Setting missing values in other columns to NaN
+dataset.dropna(inplace=True)
+
+
+
+
+##### STRUCTURIZE GENRES to onehot
+from sklearn.preprocessing import MultiLabelBinarizer
+import ast
+#serialize array
+dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) 
+print(dataset['genres']) # in py but not yet onehotenc
+
+# MultiLabelBinarizer does onehotenc for arrays
+mlb_genres = MultiLabelBinarizer()
+genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
+genres_count = len(mlb_genres.classes_) # for multi-label classifiction later
+
+genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
+print(genres_df)
+#dataset = pd.concat([dataset, genres_df], axis=1)
+#print(dataset)
+
+
+#### convert text to bag of words
+
+## Count vs Tfidf vectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+vectorizer = TfidfVectorizer()
+tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix
+tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
+print(tfidf_df)
+
+
+##### MODEL
+print("MODEL")
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.metrics import classification_report
+
+
+X = tfidf_df
+y = genres_df
+
+
+# cleanup datapoints that dont have a target value (all target columns are 0)
+mask = y.sum(axis=1).map(lambda x: x > 0)
+#print((mask == False).sum()) #31 cases with all target columns 0
+X_clean = X[mask]
+y_clean = y[mask]
+
+# Split dataset
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)
+
+
+# we want to have multiple possible outputs (multi-label-classficiation) -> multioutputclassifier
+# logi regression is our base system
+# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)
+multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)
+
+# model training
+multi_target_clf.fit(X_train, y_train)
+
+# predict against test data
+y_pred = multi_target_clf.predict(X_test)
+
+# print prec, recall, f1 etc
+print(classification_report(y_test, y_pred, zero_division=0.0))
+
+
+#print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")