Compare commits

...

10 Commits

Author SHA1 Message Date
FlorianSpeicher
0eda6dcfa8 Add newest html and pdf 2025-08-26 21:57:43 +02:00
Tim
f1cb92c4e0 Doppelt hält besser 2025-08-25 23:25:56 +02:00
Tim
59f0be8058 Add Contributors 2025-08-25 23:24:52 +02:00
Tim
d05e24eaee minor changes (no html/pdf) 2025-08-25 23:18:39 +02:00
FlorianSpeicher
ad53cc55cb Rename and add html and pdf 2025-08-25 22:10:52 +02:00
FlorianSpeicher
fba98410f6 Remove Debug Info 2025-08-25 21:45:04 +02:00
FlorianSpeicher
3ccd306e43 Change README 2025-08-25 21:41:43 +02:00
FlorianSpeicher
c3c4ebc9a7 Add Evaluation, Optimization and Conclusion. 2025-08-25 21:41:23 +02:00
FlorianSpeicher
5ad3bbf435 Delete test script 2025-08-25 21:40:43 +02:00
Tim
379fcbf881 graphmaker 2025-08-22 13:12:34 +02:00
6 changed files with 8508 additions and 205 deletions

8333
Machine-Learning.html Normal file

File diff suppressed because one or more lines are too long

View File

@@ -6,6 +6,12 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Machine Learning project in SoSe 2025 at HTW Saar\n", "# Machine Learning project in SoSe 2025 at HTW Saar\n",
"\n",
"## Contributors\n",
"- Maximilian Kany (5016118)\n",
"- Florian Speicher (5014185)\n",
"- Tim Wall (5014365)\n",
"\n",
"## Idea\n", "## Idea\n",
"The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n", "The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n",
"\n", "\n",
@@ -16,7 +22,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"id": "3116b75f", "id": "3116b75f",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
@@ -90,7 +96,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"id": "d159117377f3633c", "id": "d159117377f3633c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -113,7 +119,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"id": "986fbb31a7ae0d8b", "id": "986fbb31a7ae0d8b",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
@@ -164,13 +170,12 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Adding missing Information\n", "### Adding missing Information\n",
"Some Games might not have any descriptions. For these we Input an Empty String\n", "Some Games might not have any descriptions. For these we Input an Empty String."
"**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"id": "44239f6b7fd23cde", "id": "44239f6b7fd23cde",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -197,7 +202,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"id": "ebc5a24e9bc87fdd", "id": "ebc5a24e9bc87fdd",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -233,7 +238,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"id": "d2c3527a5fc876bf", "id": "d2c3527a5fc876bf",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -280,12 +285,12 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Structurizing Text\n", "### Structurizing Text\n",
"If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value. **TODO: filter out stopwords**" "If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"id": "4e8b407c", "id": "4e8b407c",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -293,12 +298,12 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n", " 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "1 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "2 0.0 0.0 0.0 0.162349 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "3 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n", "4 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n", "\n",
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n", " 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
@@ -307,14 +312,14 @@
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n", "\n",
"[5 rows x 29351 columns]\n" "[5 rows x 29056 columns]\n"
] ]
} }
], ],
"source": [ "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n", "\n",
"vectorizer = TfidfVectorizer()\n", "vectorizer = TfidfVectorizer(stop_words='english')\n",
"tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n", "tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
"tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n", "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
"print(tfidf_df.head())" "print(tfidf_df.head())"
@@ -330,7 +335,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"id": "86d9da42f4df8e49", "id": "86d9da42f4df8e49",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -353,7 +358,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"id": "e1bc73d4", "id": "e1bc73d4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -385,7 +390,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"id": "4919bf1b37d171a7", "id": "4919bf1b37d171a7",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -417,7 +422,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"id": "cfbf3787", "id": "cfbf3787",
"metadata": { "metadata": {
"jupyter": { "jupyter": {
@@ -441,17 +446,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 12,
"id": "0b0a46a4", "id": "0b0a46a4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"99" "82"
] ]
}, },
"execution_count": null, "execution_count": 12,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -470,7 +475,6 @@
"\n", "\n",
"# preparation of X\n", "# preparation of X\n",
"del tfidf_df\n", "del tfidf_df\n",
"del vectorizer\n",
"del tfidf_matrix\n", "del tfidf_matrix\n",
"\n", "\n",
"# Initial Dataset\n", "# Initial Dataset\n",
@@ -557,16 +561,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"id": "8c1d72c4532bd509", "id": "8c1d72c4532bd509",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": []
}
],
"source": [ "source": [
"from sklearn.svm import LinearSVC\n", "from sklearn.svm import LinearSVC\n",
"from sklearn.multioutput import MultiOutputClassifier\n", "from sklearn.multioutput import MultiOutputClassifier\n",
@@ -584,12 +582,21 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Evaluation\n", "# Evaluation\n",
"**TODO Test the Model with the test data**" "We evaluate our model by comparing the test data with the predicted data. We are using the worst case scenario by setting zero_division=0.0 in the classification report. This means that if a metric cannot be calculated due to division by zero, it is set to 0.0. Setting this parameter to 1.0 (best case) does not significantly change the results.\n",
"\n",
"Our approach involves training one model per genre, resulting in a total of 12 models for the 2k dataset. Each model predicts a specific genre, and the combined results of all models are shown at the bottom of the report. The input features are represented by X, and the output labels by y.\n",
"\n",
"Key metrics such as precision and recall are calculated for each class. These metrics indicate whether all classes are recognized and how accurate the predictions are. Notably, two classes achieve perfect 1.0 precision. For some reason, the Early Access class performs particularly poorly. The F1 score is also included in the evaluation, as it provides a balanced measure of precision and recall. The support column indicates the number of samples for each class.\n",
"\n",
"It is noteworthy that some of the top 10 words influencing the decision process are related to brands or game names, such as \"vrchat\" in Early Access and Sports, \"vermintide\" in Indie, in \"ea\" in Sports, since the description was not cleaned of developer, publisher and game names. Some words, like \"brokkoli\" in Racing, are not obviously related to the genre, which may indicate overfitting or (much more likely) the presence of only a few fitting data points in the dataset. Generally, all classes with less than 100 datapoints seem to have a very low recall but very high precision.\n",
"\n",
"A model is considered very good with an F1 score above 0.8, and good with a score above 0.7. In our case, the F1 micro and macro scores are 0.69 and 0.54, which means our model performs decent up to good. The low macro scores are mainly due to problematic classes, but overall, the weighted average and samples average are quite acceptable for a dataset of this size.\n",
"\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 14,
"id": "e2ebea6945193e07", "id": "e2ebea6945193e07",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -597,25 +604,61 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" precision recall f1-score support\n", " precision recall f1-score support\n",
"\n", "\n",
" 0 0.84 0.86 0.85 300\n", " Action 0.86 0.87 0.87 300\n",
" 1 0.74 0.63 0.68 216\n", " Adventure 0.74 0.66 0.70 216\n",
" 2 0.77 0.31 0.45 86\n", " Casual 0.79 0.22 0.35 86\n",
" 3 0.50 0.04 0.08 46\n", " Early Access 0.50 0.02 0.04 46\n",
" 4 0.69 0.33 0.44 83\n", " Free To Play 0.79 0.28 0.41 83\n",
" 5 0.79 0.80 0.79 245\n", " Indie 0.77 0.81 0.79 245\n",
" 6 0.69 0.26 0.38 42\n", "Massively Multiplayer 0.89 0.19 0.31 42\n",
" 7 0.74 0.62 0.68 127\n", " RPG 0.80 0.55 0.65 127\n",
" 8 1.00 0.67 0.80 12\n", " Racing 1.00 0.58 0.74 12\n",
" 9 0.80 0.57 0.67 127\n", " Simulation 0.86 0.50 0.64 127\n",
" 10 1.00 0.50 0.67 14\n", " Sports 1.00 0.29 0.44 14\n",
" 11 0.79 0.46 0.58 106\n", " Strategy 0.80 0.41 0.54 106\n",
"\n", "\n",
" micro avg 0.79 0.62 0.69 1404\n", " micro avg 0.81 0.60 0.69 1404\n",
" macro avg 0.78 0.51 0.59 1404\n", " macro avg 0.82 0.45 0.54 1404\n",
"weighted avg 0.77 0.62 0.67 1404\n", " weighted avg 0.80 0.60 0.65 1404\n",
" samples avg 0.80 0.68 0.70 1404\n", " samples avg 0.81 0.66 0.69 1404\n",
"\n",
"Most important words of class 'Action':\n",
"['action', 'weapons', 'shooter', 'fighting', 'fight', 'weapon', 'players', 'aim', 'gun', 'intense']\n",
"\n",
"Most important words of class 'Adventure':\n",
"['adventure', 'explore', 'puzzles', 'smite', 'far', 'stories', 'remake', 'hunting', 'don', 'secrets']\n",
"\n",
"Most important words of class 'Casual':\n",
"['puzzle', 'color', 'ball', 'smite', 'poker', 'click', 'communication', 'idle', 'cats', 'fun']\n",
"\n",
"Most important words of class 'Early Access':\n",
"['early', 'pals', 'backrooms', 'automation', 'rotwood', 'access', 'design', 'vrchat', 'nephelym', 'idleon']\n",
"\n",
"Most important words of class 'Free To Play':\n",
"['free', 'royale', 'mmo', 'pvp', 'arena', 'mmorpg', 'idle', 'cats', 'millions', 'team']\n",
"\n",
"Most important words of class 'Indie':\n",
"['game', 'horror', 'building', 'different', 'vermintide', 'generated', 'roguelike', 'better', 'soundtrack', 'procedurally']\n",
"\n",
"Most important words of class 'Massively Multiplayer':\n",
"['royale', 'mmorpg', 'players', 'mmo', 'pvp', 'ball', 'smite', 'scp', 'temtem', 'join']\n",
"\n",
"Most important words of class 'RPG':\n",
"['rpg', 'loot', 'dungeons', 'combat', 'dungeon', 'character', 'fantasy', 'quests', 'skills', '觅长生']\n",
"\n",
"Most important words of class 'Racing':\n",
"['cars', 'racing', 'car', 'race', 'speed', 'driving', 'brokkoli', 'ddnet', 'rally', 'jeff']\n",
"\n",
"Most important words of class 'Simulation':\n",
"['simulator', 'realistic', 'simulation', 'physics', 'sandbox', 'building', 'workshop', 'management', 'car', 'idle']\n",
"\n",
"Most important words of class 'Sports':\n",
"['racing', 'skate', 'sports', 'football', 'rally', 'virtual', 'ea', 'vrchat', 'hunting', 'realistic']\n",
"\n",
"Most important words of class 'Strategy':\n",
"['strategy', 'turn', 'units', 'buildings', 'strategic', 'heroes', 'tactical', 'command', '觅长生', 'squad']\n",
"\n" "\n"
] ]
} }
@@ -623,7 +666,18 @@
"source": [ "source": [
"from sklearn.metrics import classification_report\n", "from sklearn.metrics import classification_report\n",
"\n", "\n",
"print(classification_report(y_test, y_pred, zero_division=0.0))" "print(classification_report(y_test, y_pred, target_names=y_test.columns, zero_division=0.0))\n",
"\n",
"feature_names = vectorizer.get_feature_names_out()\n",
"class_names = y_test.columns\n",
"\n",
"for i, class_name in enumerate(class_names):\n",
" coef = multi_target_clf.estimators_[i].coef_.flatten()\n",
" # print the top 10 coefficients used\n",
" top10 = np.argsort(coef)[-10:]\n",
" print(f\"Most important words of class '{class_name}':\")\n",
" print([feature_names[j] for j in top10][::-1]) \n",
" print()"
] ]
}, },
{ {
@@ -632,7 +686,15 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Optimization\n", "# Optimization\n",
"**TODO optimize the model based on the test results**" "- Since our dataset contains multiple languages, it would be beneficial to either train a separate model for each language or to standardize the data before and remove the stop words specific to each language.\n",
"\n",
"- Hyperparameter validation should also be performed. For example, in LinearSVC, the C parameter controls the learning rate and could be further optimized.\n",
"\n",
"- Instead of a simple train-test split, k-fold cross validation without a fixed random_state should be used to prevent overfitting, better data mixing and more robust results.\n",
"\n",
"- Additionally, ensemble learning methods could further improve performance.\n",
"\n",
"The biggest limitation of our dataset is the presence of many (especially CJK-) languages but too few entries for each, which is also constrained by our computing resources."
] ]
}, },
{ {
@@ -641,13 +703,21 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Conclusion and outlook\n", "# Conclusion and outlook\n",
"**TODO Write a conclusion and outlook what can be done and where the issues were.**" "To conclude we can say that our model performs reasonably well for the intended application. With a larger dataset, the results would likely improve further. Considering the points mentioned above, it is quite impressive that the model achieves these results using only a small dataset and limited computational resources.\n",
"\n",
"Our collaboration as a team worked very smoothly throughout the project. Communication and planning were effective, allowing us to coordinate our tasks efficiently and make steady progress.\n",
"\n",
"The main challenge we faced was the limited computational resources available to us. Especially when working with the 10k dataset, training the models for statistical evaluation took a considerable amount of time. To address this, each team member ran different models in parallel on their own machines, with some training processes running for several days.\n",
"\n",
"Due to these computational constraints, we decided not to process the full dataset with 80,000 entries. Even though we had access to PCs equipped with the mid to high-range components, the training times were still prohibitively long. As a result, we focused our efforts on the smaller datasets to ensure we could complete the project within a reasonable timeframe.\n",
"\n",
"In summary, this project provided us with valuable insights into the challenges and opportunities of machine learning in a real-world context. Despite the limitations we faced, we were able to develop a functioning model and gain practical experience in data preprocessing, model selection, and evaluation. We are proud of what we achieved as a team and look forward to applying the knowledge and skills gained here to future projects.\n"
] ]
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "base",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@@ -661,7 +731,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.13.3" "version": "3.13.5"
} }
}, },
"nbformat": 4, "nbformat": 4,

BIN
Machine-Learning.pdf Normal file

Binary file not shown.

View File

@@ -1,18 +1,51 @@
# Machine Learning Project Summer Semester 2025 # Machine Learning Project Summer Semester 2025
This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program. This project was developed as part of the "Machine Learning" course at HTW Saar in the summer semester 2025 in "Practical Computer Science". The goal is to predict the genres of a game based on its description using various machine learning techniques.
## Objective ## Project Overview
We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions. We use a cleaned Steam dataset containing game descriptions and genre labels as well as many other feature values. The main challenge was to build a robust multi label classification model that can handle multiple genres per game and work with a relatively small dataset due to computational constraints.
As a data basis, we use a publicly available Steam Games dataset that we found on Kaggle.
Our workflow includes:
- Data cleaning and preprocessing
- Feature extraction
- Multi label genre encoding
- Model selection and evaluation
- Optimization suggestions for future work
## Dataset ## Dataset
We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data). The dataset used for this project is available here:
[Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)
## Repository
The full project, including the Jupyter Notebook, code, results and all data set sizes used, can be found on GitHub:
[GitHub FlorianSpeicher04/machine-learning](https://github.com/FlorianSpeicher04/machine-learning)
## Large File Storage (git-lfs)
Some files in this repository (such as the datasets) are managed using [git-lfs](https://git-lfs.github.com/).
To clone the repository with all large files, please make sure you have git-lfs installed:
```sh
git lfs install
git clone https://github.com/FlorianSpeicher04/machine-learning
```
## How to Run
1. Clone the repository (see above).
2. Install the required Python packages.
3. Open `notebook.ipynb` in Jupyter Notebook or VS Code.
4. Follow the steps in the notebook to reproduce the results (Run All).
## Results
Our model achieves reasonable performance given the dataset size and computational limitations. For more details, see the evaluation and conclusion sections in the notebook.
## Contributors ## Contributors
- Maximilian Kany - Maximilian Kany 5016118
- Florian Speicher - Florian Speicher 5014185
- Tim Wall - Tim Wall 5014365

View File

@@ -2,9 +2,9 @@ import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
datasets = { datasets = {
"cleaned": "games_march2025_cleaned", #"cleaned": "games_march2025_cleaned",
"cleaned_2k": "games_march2025_cleaned_2k", "cleaned_2k": "games_march2025_cleaned_2k_i3k",
"cleaned_10k": "games_march2025_cleaned_10k" "cleaned_10k": "games_march2025_cleaned_10k_i3k"
} }
# def results # def results
results = {} results = {}
@@ -28,14 +28,14 @@ x = range(len(models))
plt.figure(figsize=(12,6)) plt.figure(figsize=(12,6))
#plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned") #plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k") plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="2k Dataset")
#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k") plt.bar([i + 0.25 for i in x], [results["cleaned_10k"].get(m,0) for m in models], width=0.25, label="10k Dataset")
plt.xticks(x, models, rotation=90) plt.xticks(x, models, rotation=90)
plt.ylim(0, 1) # min max plt.ylim(0, 1) # min max
plt.ylabel("Weighted F1-Score") plt.ylabel("Weighted F1-Score")
plt.title("Model Performance across Datasets") plt.title("Model Performance across Datasets")
#plt.legend() plt.legend()
plt.tight_layout() plt.tight_layout()
plt.savefig('compare_graph_latest.png') plt.savefig('compare_graph_latest.png')
plt.show() plt.show()

View File

@@ -1,133 +0,0 @@
#### INITIALIZE
# End-to-end experiment: load the 2k Steam games CSV, merge the three description
# columns into one text field, one-hot encode the genre lists, vectorize the text
# with TF-IDF and train/evaluate one logistic-regression classifier per genre.
import ast

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer

set_config(transform_output="pandas")  # dataframe supremacy: transformers return DataFrames


def _parse_genres(raw):
    """Convert one raw 'genres' cell into a Python list of genre names.

    The cell is presumably a stringified Python list (TODO confirm against the CSV).
    Missing or empty cells yield an empty list instead of being passed to
    ast.literal_eval, which raises SyntaxError on an empty string. Rows with an
    empty list end up with all-zero targets and are removed before training.
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    return ast.literal_eval(raw)


# load data
# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
dataset = pd.read_csv("./games_march2025_cleaned_2k.csv", sep=",")
print(dataset.head())

#### DROP UNIQUES
print("DROP")
# TODO: these columns are dropped by the ColumnTransformer below anyway
#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',
#              'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],
#              axis=1, inplace=True)
#print(dataset.head())

#### STRUCTURIZE AND STANDARDIZE
print("STRUCTURE")
# desc, genres, tags
column_transformer = ColumnTransformer([
    # merge all descriptions into a single 'desc' text column (missing parts become '')
    ('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
     ['detailed_description', 'about_the_game', 'short_description']),
    # genre -> actual genre, but very coarse
    # tags -> user defined tags; title num list
    #TODO: decide whether we drop tags
    ('pass', 'passthrough', ['genres']),#, 'tags'
    ],
    verbose_feature_names_out=False
)
dataset = column_transformer.fit_transform(dataset)
print(dataset)

#### SET MISSING VALUES
print("SETMISS")
# Only the text columns 'desc' and 'genres' exist at this point, so a numeric mean
# fill has nothing to act on, and a dropna() after filling can never drop a row.
# Missing text is replaced by the empty string; _parse_genres copes with that.
dataset = dataset.fillna('')

##### STRUCTURIZE GENRES to onehot
# deserialize the stringified genre lists
dataset['genres'] = dataset['genres'].map(_parse_genres)
print(dataset['genres']) # python lists, but not yet one-hot encoded
# MultiLabelBinarizer does the one-hot encoding for lists of labels
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
genres_count = len(mlb_genres.classes_) # for multi-label classification later
# reuse the dataset index so features and targets stay row-aligned
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_, index=dataset.index)
print(genres_df)
#dataset = pd.concat([dataset, genres_df], axis=1)
#print(dataset)

#### convert text to bag of words
## Count vs Tfidf vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # sparse matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=dataset.index)
print(tfidf_df)

##### MODEL
print("MODEL")
X = tfidf_df
y = genres_df
# cleanup datapoints that dont have a target value (all target columns are 0)
mask = y.sum(axis=1) > 0
#print((mask == False).sum()) #31 cases with all target columns 0
X_clean = X[mask]
y_clean = y[mask]
# Split dataset (fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)
# we want to have multiple possible outputs (multi-label classification) -> MultiOutputClassifier
# logistic regression is our base estimator (one fitted copy per genre column)
# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)
multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)
# model training
multi_target_clf.fit(X_train, y_train)
# predict against test data
y_pred = multi_target_clf.predict(X_test)
# print precision, recall, f1 etc.; undefined metrics (division by zero) are reported as 0.0
print(classification_report(y_test, y_pred, zero_division=0.0))
#print(f"Training data: {X_train.shape}, test data: {X_test.shape}")