Add newest html and pdf

Doppelt hält besser
Add Contributors
2025-08-26 21:57:43 +02:00 · 2025-08-25 23:25:56 +02:00 · 2025-08-25 23:24:52 +02:00 · 2025-08-25 23:18:39 +02:00 · 2025-08-25 22:10:52 +02:00 · 2025-08-25 21:45:04 +02:00
6 changed files with 8508 additions and 205 deletions
--- a/Machine-Learning.html
+++ b/Machine-Learning.html
--- a/Machine-Learning.ipynb
+++ b/Machine-Learning.ipynb
@@ -6,6 +6,12 @@
   "metadata": {},
   "source": [
    "# Machine Learning project in SoSe 2025 at HTW Saar\n",
    "\n",
    "## Contributors\n",
    "- Maximilian Kany (5016118)\n",
    "- Florian Speicher (5014185)\n",
    "- Tim Wall (5014365)\n",
    "\n",
    "## Idea\n",
    "The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n",
    "\n",
@@ -16,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "id": "3116b75f",
   "metadata": {
    "jupyter": {
@@ -90,7 +96,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "id": "d159117377f3633c",
   "metadata": {},
   "outputs": [],
@@ -113,7 +119,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "id": "986fbb31a7ae0d8b",
   "metadata": {
    "jupyter": {
@@ -164,13 +170,12 @@
   "metadata": {},
   "source": [
    "### Adding missing Information\n",
-    "Some Games might not have any descriptions. For these we Input an Empty String\n",
+    "Some Games might not have any descriptions. For these we Input an Empty String."
    "**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "id": "44239f6b7fd23cde",
   "metadata": {},
   "outputs": [],
@@ -197,7 +202,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "id": "ebc5a24e9bc87fdd",
   "metadata": {},
   "outputs": [
@@ -233,7 +238,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "id": "d2c3527a5fc876bf",
   "metadata": {},
   "outputs": [
@@ -280,12 +285,12 @@
   "metadata": {},
   "source": [
    "### Structurizing Text\n",
-    "If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value. **TODO: filter out stopwords**"
+    "If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Term Frequency-Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "id": "4e8b407c",
   "metadata": {},
   "outputs": [
@@ -293,12 +298,12 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "    00  000  000km    000th  00am  00f  00i  00p  00v   01  ...  이터널  이터널리턴  \\\n",
+      "    00  000  000km     000th  00am  00f  00i  00p  00v   01  ...  이터널  이터널리턴  \\\n",
-      "0  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "0  0.0  0.0    0.0  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "1  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "1  0.0  0.0    0.0  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "2  0.0  0.0    0.0  0.14649   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "2  0.0  0.0    0.0  0.162349   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "3  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "3  0.0  0.0    0.0  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
-      "4  0.0  0.0    0.0  0.00000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
+      "4  0.0  0.0    0.0  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0    0.0   \n",
      "\n",
      "   이현준  정대찬  중입니다   철권  토탈워  페르소나  한국어  한글을  \n",
      "0  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
@@ -307,14 +312,14 @@
      "3  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "4  0.0  0.0   0.0  0.0  0.0   0.0  0.0  0.0  \n",
      "\n",
-      "[5 rows x 29351 columns]\n"
+      "[5 rows x 29056 columns]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
-    "vectorizer = TfidfVectorizer()\n",
+    "vectorizer = TfidfVectorizer(stop_words='english')\n",
    "tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
    "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
    "print(tfidf_df.head())"
@@ -330,7 +335,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "id": "86d9da42f4df8e49",
   "metadata": {},
   "outputs": [],
@@ -353,7 +358,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "id": "e1bc73d4",
   "metadata": {},
   "outputs": [
@@ -385,7 +390,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "id": "4919bf1b37d171a7",
   "metadata": {},
   "outputs": [
@@ -417,7 +422,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "id": "cfbf3787",
   "metadata": {
    "jupyter": {
@@ -441,17 +446,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "id": "0b0a46a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "99"
+       "82"
      ]
     },
-     "execution_count": null,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -470,7 +475,6 @@
    "\n",
    "# preparation of X\n",
    "del tfidf_df\n",
    "del vectorizer\n",
    "del tfidf_matrix\n",
    "\n",
    "# Initial Dataset\n",
@@ -557,16 +561,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "id": "8c1d72c4532bd509",
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stderr",
     "output_type": "stream",
     "text": []
    }
   ],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.multioutput import MultiOutputClassifier\n",
@@ -584,12 +582,21 @@
   "metadata": {},
   "source": [
    "# Evaluation\n",
-    "**TODO Test the Model with the test data**"
+    "We evaluate our model by comparing the test data with the predicted data. We are using the worst case scenario by setting zero_division=0.0 in the classification report. This means that if a metric cannot be calculated due to division by zero, it is set to 0.0. Setting this parameter to 1.0 (best case) does not significantly change the results.\n",
    "\n",
    "Our approach involves training one model per genre, resulting in a total of 12 models for the 2k dataset. Each model predicts a specific genre, and the combined results of all models are shown at the bottom of the report. The input features are represented by X, and the output labels by y.\n",
    "\n",
    "Key metrics such as precision and recall are calculated for each class. These metrics indicate whether all classes are recognized and how accurate the predictions are. Notably, two classes achieve perfect 1.0 precision. For some reason, the Early Access class performs particularly poorly. The F1 score is also included in the evaluation, as it provides a balanced measure of precision and recall. The support column indicates the number of samples for each class.\n",
    "\n",
    "It is noteworthy that some of the top 10 words influencing the decision process are related to brands or game names, such as \"vrchat\" in Early Access and Sports, \"vermintide\" in Indie, and \"ea\" in Sports, since the descriptions were not cleaned of developer, publisher and game names. Some words, like \"brokkoli\" in Racing, are not obviously related to the genre, which may indicate overfitting or (much more likely) the presence of only a few fitting data points in the dataset. Generally, all classes with fewer than 100 data points seem to have a very low recall but very high precision.\n",
    "\n",
    "A model is considered very good with an F1 score above 0.8, and good with a score above 0.7. In our case, the micro and macro F1 scores are 0.69 and 0.54, which means our model performs decently, bordering on good. The low macro score is mainly due to problematic classes, but overall, the weighted average and samples average are quite acceptable for a dataset of this size.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "id": "e2ebea6945193e07",
   "metadata": {},
   "outputs": [
@@ -597,25 +604,61 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "              precision    recall  f1-score   support\n",
+      "                       precision    recall  f1-score   support\n",
      "\n",
-      "           0       0.84      0.86      0.85       300\n",
+      "               Action       0.86      0.87      0.87       300\n",
-      "           1       0.74      0.63      0.68       216\n",
+      "            Adventure       0.74      0.66      0.70       216\n",
-      "           2       0.77      0.31      0.45        86\n",
+      "               Casual       0.79      0.22      0.35        86\n",
-      "           3       0.50      0.04      0.08        46\n",
+      "         Early Access       0.50      0.02      0.04        46\n",
-      "           4       0.69      0.33      0.44        83\n",
+      "         Free To Play       0.79      0.28      0.41        83\n",
-      "           5       0.79      0.80      0.79       245\n",
+      "                Indie       0.77      0.81      0.79       245\n",
-      "           6       0.69      0.26      0.38        42\n",
+      "Massively Multiplayer       0.89      0.19      0.31        42\n",
-      "           7       0.74      0.62      0.68       127\n",
+      "                  RPG       0.80      0.55      0.65       127\n",
-      "           8       1.00      0.67      0.80        12\n",
+      "               Racing       1.00      0.58      0.74        12\n",
-      "           9       0.80      0.57      0.67       127\n",
+      "           Simulation       0.86      0.50      0.64       127\n",
-      "          10       1.00      0.50      0.67        14\n",
+      "               Sports       1.00      0.29      0.44        14\n",
-      "          11       0.79      0.46      0.58       106\n",
+      "             Strategy       0.80      0.41      0.54       106\n",
      "\n",
-      "   micro avg       0.79      0.62      0.69      1404\n",
+      "            micro avg       0.81      0.60      0.69      1404\n",
-      "   macro avg       0.78      0.51      0.59      1404\n",
+      "            macro avg       0.82      0.45      0.54      1404\n",
-      "weighted avg       0.77      0.62      0.67      1404\n",
+      "         weighted avg       0.80      0.60      0.65      1404\n",
-      " samples avg       0.80      0.68      0.70      1404\n",
+      "          samples avg       0.81      0.66      0.69      1404\n",
      "\n",
      "Most important words of class 'Action':\n",
      "['action', 'weapons', 'shooter', 'fighting', 'fight', 'weapon', 'players', 'aim', 'gun', 'intense']\n",
      "\n",
      "Most important words of class 'Adventure':\n",
      "['adventure', 'explore', 'puzzles', 'smite', 'far', 'stories', 'remake', 'hunting', 'don', 'secrets']\n",
      "\n",
      "Most important words of class 'Casual':\n",
      "['puzzle', 'color', 'ball', 'smite', 'poker', 'click', 'communication', 'idle', 'cats', 'fun']\n",
      "\n",
      "Most important words of class 'Early Access':\n",
      "['early', 'pals', 'backrooms', 'automation', 'rotwood', 'access', 'design', 'vrchat', 'nephelym', 'idleon']\n",
      "\n",
      "Most important words of class 'Free To Play':\n",
      "['free', 'royale', 'mmo', 'pvp', 'arena', 'mmorpg', 'idle', 'cats', 'millions', 'team']\n",
      "\n",
      "Most important words of class 'Indie':\n",
      "['game', 'horror', 'building', 'different', 'vermintide', 'generated', 'roguelike', 'better', 'soundtrack', 'procedurally']\n",
      "\n",
      "Most important words of class 'Massively Multiplayer':\n",
      "['royale', 'mmorpg', 'players', 'mmo', 'pvp', 'ball', 'smite', 'scp', 'temtem', 'join']\n",
      "\n",
      "Most important words of class 'RPG':\n",
      "['rpg', 'loot', 'dungeons', 'combat', 'dungeon', 'character', 'fantasy', 'quests', 'skills', '觅长生']\n",
      "\n",
      "Most important words of class 'Racing':\n",
      "['cars', 'racing', 'car', 'race', 'speed', 'driving', 'brokkoli', 'ddnet', 'rally', 'jeff']\n",
      "\n",
      "Most important words of class 'Simulation':\n",
      "['simulator', 'realistic', 'simulation', 'physics', 'sandbox', 'building', 'workshop', 'management', 'car', 'idle']\n",
      "\n",
      "Most important words of class 'Sports':\n",
      "['racing', 'skate', 'sports', 'football', 'rally', 'virtual', 'ea', 'vrchat', 'hunting', 'realistic']\n",
      "\n",
      "Most important words of class 'Strategy':\n",
      "['strategy', 'turn', 'units', 'buildings', 'strategic', 'heroes', 'tactical', 'command', '觅长生', 'squad']\n",
      "\n"
     ]
    }
@@ -623,7 +666,18 @@
   "source": [
    "from sklearn.metrics import classification_report\n",
    "\n",
-    "print(classification_report(y_test, y_pred, zero_division=0.0))"
+    "print(classification_report(y_test, y_pred, target_names=y_test.columns, zero_division=0.0))\n",
    "\n",
    "feature_names = vectorizer.get_feature_names_out()\n",
    "class_names = y_test.columns\n",
    "\n",
    "for i, class_name in enumerate(class_names):\n",
    "    coef = multi_target_clf.estimators_[i].coef_.flatten()\n",
    "    # print the top 10 coefficients used\n",
    "    top10 = np.argsort(coef)[-10:]\n",
    "    print(f\"Most important words of class '{class_name}':\")\n",
    "    print([feature_names[j] for j in top10][::-1]) \n",
    "    print()"
   ]
  },
  {
@@ -632,7 +686,15 @@
   "metadata": {},
   "source": [
    "# Optimization\n",
-    "**TODO optimize the model based on the test results**"
+    "- Since our dataset contains multiple languages, it would be beneficial to either train a separate model for each language or to standardize the data beforehand and remove the stop words specific to each language.\n",
    "\n",
    "- Hyperparameter tuning should also be performed. For example, in LinearSVC, the C parameter controls the regularization strength (a smaller C means stronger regularization) and could be further optimized.\n",
    "\n",
    "- Instead of a simple train-test split, k-fold cross validation without a fixed random_state should be used to detect overfitting, mix the data better and obtain more robust results.\n",
    "\n",
    "- Additionally, ensemble learning methods could further improve performance.\n",
    "\n",
    "The biggest limitation of our dataset is the presence of many (especially CJK-) languages but too few entries for each, which is also constrained by our computing resources."
   ]
  },
  {
@@ -641,13 +703,21 @@
   "metadata": {},
   "source": [
    "# Conclusion and outlook\n",
-    "**TODO Write a conclusion and outlook what can be done and where the issues were.**"
+    "To conclude, we can say that our model performs reasonably well for the intended application. With a larger dataset, the results would likely improve further. Considering the points mentioned above, it is quite impressive that the model achieves these results using only a small dataset and limited computational resources.\n",
    "\n",
    "Our collaboration as a team worked very smoothly throughout the project. Communication and planning were effective, allowing us to coordinate our tasks efficiently and make steady progress.\n",
    "\n",
    "The main challenge we faced was the limited computational resources available to us. Especially when working with the 10k dataset, training the models for statistical evaluation took a considerable amount of time. To address this, each team member ran different models in parallel on their own machines, with some training processes running for several days.\n",
    "\n",
    "Due to these computational constraints, we decided not to process the full dataset with 80,000 entries. Even though we had access to PCs equipped with mid- to high-range components, the training times were still prohibitively long. As a result, we focused our efforts on the smaller datasets to ensure we could complete the project within a reasonable timeframe.\n",
    "\n",
    "In summary, this project provided us with valuable insights into the challenges and opportunities of machine learning in a real-world context. Despite the limitations we faced, we were able to develop a functioning model and gain practical experience in data preprocessing, model selection, and evaluation. We are proud of what we achieved as a team and look forward to applying the knowledge and skills gained here to future projects.\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
@@ -661,7 +731,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.13.3"
+   "version": "3.13.5"
  }
 },
 "nbformat": 4,
--- a/Machine-Learning.pdf
+++ b/Machine-Learning.pdf
--- a/README.md
+++ b/README.md
@@ -1,18 +1,51 @@
 # Machine Learning Project – Summer Semester 2025
-This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program.
+This project was developed as part of the "Machine Learning" course in the "Practical Computer Science" study program at HTW Saar in the summer semester 2025. The goal is to predict the genres of a game based on its description using various machine learning techniques.
-## Objective
+## Project Overview
-We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions.  
+We use a cleaned Steam dataset containing game descriptions and genre labels as well as many other feature values. The main challenge was to build a robust multi-label classification model that can handle multiple genres per game and work with a relatively small dataset due to computational constraints.
-As a data basis, we use a publicly available Steam Games dataset that we found on Kaggle.
+
 Our workflow includes:
 - Data cleaning and preprocessing
 - Feature extraction
 - Multi label genre encoding
 - Model selection and evaluation
 - Optimization suggestions for future work
 ## Dataset
-We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data).
+The dataset used for this project is available here:  
 [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)
 ## Repository
 The full project, including the Jupyter Notebook, code, results and all data set sizes used, can be found on GitHub:  
 [GitHub FlorianSpeicher04/machine-learning](https://github.com/FlorianSpeicher04/machine-learning)
 ## Large File Storage (git-lfs)
 Some files in this repository (such as the datasets) are managed using [git-lfs](https://git-lfs.github.com/).  
 To clone the repository with all large files, please make sure you have git-lfs installed:
 ```sh
 git lfs install
 git clone https://github.com/FlorianSpeicher04/machine-learning
 ```
 ## How to Run
 1. Clone the repository (see above).
 2. Install the required Python packages.
 3. Open `Machine-Learning.ipynb` in Jupyter Notebook or VS Code.
 4. Follow the steps in the notebook to reproduce the results (Run All).
 ## Results
 Our model achieves reasonable performance given the dataset size and computational limitations. For more details, see the evaluation and conclusion sections in the notebook.
 ## Contributors
- Maximilian Kany
+- Maximilian Kany 5016118
- Florian Speicher
+- Florian Speicher 5014185
- Tim Wall
+- Tim Wall 5014365
--- a/compare_graph_maker.py
+++ b/compare_graph_maker.py
@@ -2,9 +2,9 @@ import os
 import matplotlib.pyplot as plt
 datasets = {
-    "cleaned": "games_march2025_cleaned",
+    #"cleaned": "games_march2025_cleaned",
-    "cleaned_2k": "games_march2025_cleaned_2k",
+    "cleaned_2k": "games_march2025_cleaned_2k_i3k",
-    "cleaned_10k": "games_march2025_cleaned_10k"
+    "cleaned_10k": "games_march2025_cleaned_10k_i3k"
 }
 # def results
 results = {}
@@ -28,14 +28,14 @@ x = range(len(models))
 plt.figure(figsize=(12,6))
 #plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
-plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k")
+plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="2k Dataset")
-#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
+plt.bar([i + 0.25 for i in x], [results["cleaned_10k"].get(m,0) for m in models], width=0.25, label="10k Dataset")
 plt.xticks(x, models, rotation=90)
 plt.ylim(0, 1) # min max
 plt.ylabel("Weighted F1-Score")
 plt.title("Model Performance across Datasets")
-#plt.legend()
+plt.legend()
 plt.tight_layout()
 plt.savefig('compare_graph_latest.png')
 plt.show()
--- a/test_script.py
+++ b/test_script.py
@@ -1,133 +0,0 @@
 #### INITIALIZE
 # Exploratory script: loads the 2k sample of the Steam games CSV and prepares
 # it for a multi-label genre classifier trained on the game descriptions.
 import numpy as np
 import pandas as pd
 from sklearn import set_config
 # Ask scikit-learn transformers to return pandas DataFrames, so column names
 # survive the ColumnTransformer step further down.
 set_config(transform_output="pandas")
 # Load data.
 # Column list, presumably the header of the CSV read below (kept for reference):
 # appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
 dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",")
 # Print the first rows as a quick sanity check of the load.
 print(dataset.head())
 #### DROP UNIQUES
 # This section is currently disabled: only the progress marker is printed.
 print("DROP")
 # TODO: redundant - the ColumnTransformer in the next section only keeps the
 # description and genre columns, so these columns are discarded there anyway.
 # Column list, presumably the CSV header (kept for reference):
 # appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
 #dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',
 #              'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],
 #              axis=1, inplace=True)
 #print(dataset.head())
 #### STRUCTURIZE AND STANDARDIZE
 # Reduce the dataset to two columns: a merged free-text column "desc" and the
 # untouched "genres" column. Columns not listed in the transformer are not
 # selected here.
 print("STRUCTURE")
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import FunctionTransformer
 # Columns of interest: desc, genres, tags
 column_transformer = ColumnTransformer([
         # Merge all three description columns into one: missing values become
         # '' first, then the row's texts are joined with single spaces and
         # returned as a one-column frame named "desc".
         ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
             ['detailed_description', 'about_the_game', 'short_description']),
         # genres -> actual genre, but very coarse
         # tags   -> user-defined tags; title/number list
         # TODO: decide whether we drop tags (currently not passed through)
         ('pass', 'passthrough', ['genres']),#, 'tags'
     ],
     # Keep plain output column names ("desc", "genres") without a
     # "<transformer>__" prefix.
     verbose_feature_names_out=False
 )
 # Replace the raw dataset with the transformed frame.
 dataset = column_transformer.fit_transform(dataset)
 print(dataset)
 #### SET MISSING VALUES
 print("SETMISS")
 # Fill missing values in numeric columns with that column's mean.
 # NOTE(review): presumably a no-op here, since only the text columns "desc"
 # and "genres" are produced by the column transformer - confirm.
 dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
 # Fill every remaining missing value with an empty string (not 'Unknown').
 dataset.fillna('', inplace=True)
 # Drop any row that still contains a missing value; after the fill above
 # this is a safety net rather than an active filter.
 dataset.dropna(inplace=True)
 ##### STRUCTURIZE GENRES to onehot
 from sklearn.preprocessing import MultiLabelBinarizer
 import ast
 # Parse each "genres" cell from its string form into a Python object via
 # ast.literal_eval (assumes every cell is a valid Python literal such as a
 # list of genre names - TODO confirm; an empty string would raise here).
 dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s)) 
 print(dataset['genres']) # now Python objects, but not yet one-hot encoded
 # MultiLabelBinarizer produces one 0/1 indicator column per distinct label.
 mlb_genres = MultiLabelBinarizer()
 # pop() removes "genres" from the dataset while handing it to the binarizer.
 genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
 genres_count = len(mlb_genres.classes_) # number of distinct genres, for multi-label classification later
 # Wrap the encoded array in a DataFrame with the genre names as columns.
 genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
 print(genres_df)
 #dataset = pd.concat([dataset, genres_df], axis=1)
 #print(dataset)
 #### convert text to bag of words
 ## Count vs Tfidf vectorizer: the TF-IDF variant is the one used here.
 from sklearn.feature_extraction.text import TfidfVectorizer
 # Default settings: no stop-word filtering is configured.
 vectorizer = TfidfVectorizer()
 # Learn the vocabulary from the merged descriptions and vectorize them.
 tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not a DataFrame
 # Convert to a dense array and label each column with its vocabulary term.
 tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
 print(tfidf_df)
 ##### MODEL
 print("MODEL")
 from sklearn.linear_model import LogisticRegression
 from sklearn.multioutput import MultiOutputClassifier
 from sklearn.metrics import classification_report
 # Features: TF-IDF term weights; targets: one 0/1 column per genre.
 X = tfidf_df
 y = genres_df
 # Clean up data points that do not have a target value (all target columns are 0).
 # mask is True for rows with at least one genre set.
 mask = y.sum(axis=1).map(lambda x: x > 0)
 #print((mask == False).sum()) #31 cases with all target columns 0
 X_clean = X[mask]
 y_clean = y[mask]
 # Split dataset; random_state=0 keeps the split reproducible and the test
 # size is left at the train_test_split default.
 from sklearn.model_selection import train_test_split
 X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)
 # We want multiple possible outputs per sample (multi-label classification),
 # so MultiOutputClassifier is wrapped around the base estimator.
 # Logistic regression is our base system.
 # n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)
 multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)
 # Model training
 multi_target_clf.fit(X_train, y_train)
 # Predict against the held-out test data
 y_pred = multi_target_clf.predict(X_test)
 # Print precision, recall, F1 etc.; zero_division=0.0 reports undefined
 # metrics as 0.0.
 print(classification_report(y_test, y_pred, zero_division=0.0))
 #print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")
Author	SHA1	Message	Date
FlorianSpeicher	0eda6dcfa8	Add newest html and pdf	2025-08-26 21:57:43 +02:00
Tim	f1cb92c4e0	Doppelt hält besser	2025-08-25 23:25:56 +02:00
Tim	59f0be8058	Add Contributors	2025-08-25 23:24:52 +02:00
Tim	d05e24eaee	minor changes (no html/pdf)	2025-08-25 23:18:39 +02:00
FlorianSpeicher	ad53cc55cb	Rename and add html and pdf	2025-08-25 22:10:52 +02:00
FlorianSpeicher	fba98410f6	Remove Debug Info	2025-08-25 21:45:04 +02:00
FlorianSpeicher	3ccd306e43	Change README	2025-08-25 21:41:43 +02:00
FlorianSpeicher	c3c4ebc9a7	Add Evaluation, Optimization and Conclusion.	2025-08-25 21:41:23 +02:00
FlorianSpeicher	5ad3bbf435	Delete test script	2025-08-25 21:40:43 +02:00
Tim	379fcbf881	graphmaker	2025-08-22 13:12:34 +02:00