Compare commits
10 Commits
cf92ef246a
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0eda6dcfa8 | ||
|
|
f1cb92c4e0 | ||
|
|
59f0be8058 | ||
|
|
d05e24eaee | ||
|
|
ad53cc55cb | ||
|
|
fba98410f6 | ||
|
|
3ccd306e43 | ||
|
|
c3c4ebc9a7 | ||
|
|
5ad3bbf435 | ||
|
|
379fcbf881 |
8333
Machine-Learning.html
Normal file
8333
Machine-Learning.html
Normal file
File diff suppressed because one or more lines are too long
@@ -6,6 +6,12 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Machine Learning project in SoSe 2025 at HTW Saar\n",
|
||||
"\n",
|
||||
"## Contributors\n",
|
||||
"- Maximilian Kany (5016118)\n",
|
||||
"- Florian Speicher (5014185)\n",
|
||||
"- Tim Wall (5014365)\n",
|
||||
"\n",
|
||||
"## Idea\n",
|
||||
"The goal of this project is predicting the genre(s) of a game/bundle through its given description(s)\n",
|
||||
"\n",
|
||||
@@ -16,7 +22,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"id": "3116b75f",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
@@ -90,7 +96,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "d159117377f3633c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -113,7 +119,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"id": "986fbb31a7ae0d8b",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
@@ -164,13 +170,12 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Adding missing Information\n",
|
||||
"Some Games might not have any descriptions. For these we Input an Empty String\n",
|
||||
"**TODO: check if dropna and fillna numeric_only is needed, as we dont have any numbers**"
|
||||
"Some games might not have any descriptions. For these we insert an empty string."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"id": "44239f6b7fd23cde",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -197,7 +202,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"id": "ebc5a24e9bc87fdd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -233,7 +238,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"id": "d2c3527a5fc876bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -280,12 +285,12 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Structurizing Text\n",
|
||||
"If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value. **TODO: filter out stopwords**"
|
||||
"If we want our Model to be able to use text as an input, we have to vectorize the text. TF-IDF (Term Frequency–Inverse Document Frequency) is an easy way of transforming each word into a feature with a 0 to 1 value."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"id": "4e8b407c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -293,12 +298,12 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
|
||||
"0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.14649 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
" 00 000 000km 000th 00am 00f 00i 00p 00v 01 ... 이터널 이터널리턴 \\\n",
|
||||
"0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"1 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"2 0.0 0.0 0.0 0.162349 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"3 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
||||
"\n",
|
||||
" 이현준 정대찬 중입니다 철권 토탈워 페르소나 한국어 한글을 \n",
|
||||
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
@@ -307,14 +312,14 @@
|
||||
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 29351 columns]\n"
|
||||
"[5 rows x 29056 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"vectorizer = TfidfVectorizer()\n",
|
||||
"vectorizer = TfidfVectorizer(stop_words='english')\n",
|
||||
"tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix, not pandas df\n",
|
||||
"tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())\n",
|
||||
"print(tfidf_df.head())"
|
||||
@@ -330,7 +335,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"id": "86d9da42f4df8e49",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -353,7 +358,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"id": "e1bc73d4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -385,7 +390,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"id": "4919bf1b37d171a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -417,7 +422,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"id": "cfbf3787",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
@@ -441,17 +446,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"id": "0b0a46a4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"99"
|
||||
"82"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -470,7 +475,6 @@
|
||||
"\n",
|
||||
"# preparation of X\n",
|
||||
"del tfidf_df\n",
|
||||
"del vectorizer\n",
|
||||
"del tfidf_matrix\n",
|
||||
"\n",
|
||||
"# Initial Dataset\n",
|
||||
@@ -557,16 +561,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 13,
|
||||
"id": "8c1d72c4532bd509",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": []
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.svm import LinearSVC\n",
|
||||
"from sklearn.multioutput import MultiOutputClassifier\n",
|
||||
@@ -584,12 +582,21 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluation\n",
|
||||
"**TODO Test the Model with the test data**"
|
||||
"We evaluate our model by comparing the test data with the predicted data. We are using the worst case scenario by setting zero_division=0.0 in the classification report. This means that if a metric cannot be calculated due to division by zero, it is set to 0.0. Setting this parameter to 1.0 (best case) does not significantly change the results.\n",
|
||||
"\n",
|
||||
"Our approach involves training one model per genre, resulting in a total of 12 models for the 2k dataset. Each model predicts a specific genre, and the combined results of all models are shown at the bottom of the report. The input features are represented by X, and the output labels by y.\n",
|
||||
"\n",
|
||||
"Key metrics such as precision and recall are calculated for each class. These metrics indicate whether all classes are recognized and how accurate the predictions are. Notably, two classes achieve perfect 1.0 precision. For some reason, the Early Access class performs particularly poorly. The F1 score is also included in the evaluation, as it provides a balanced measure of precision and recall. The support column indicates the number of samples for each class.\n",
|
||||
"\n",
|
||||
"It is noteworthy that some of the top 10 words influencing the decision process are related to brands or game names, such as \"vrchat\" in Early Access and Sports, \"vermintide\" in Indie, and \"ea\" in Sports, since the description was not cleaned of developer, publisher and game names. Some words, like \"brokkoli\" in Racing, are not obviously related to the genre, which may indicate overfitting or (much more likely) the presence of only a few fitting data points in the dataset. Generally, all classes with fewer than 100 data points seem to have a very low recall but very high precision.\n",
|
||||
"\n",
|
||||
"A model is considered very good with an F1 score above 0.8, and good with a score above 0.7. In our case, the F1 micro and macro scores are 0.69 and 0.54, which means our model performs decently, bordering on good. The low macro scores are mainly due to problematic classes, but overall, the weighted average and samples average are quite acceptable for a dataset of this size.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 14,
|
||||
"id": "e2ebea6945193e07",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -597,25 +604,61 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.84 0.86 0.85 300\n",
|
||||
" 1 0.74 0.63 0.68 216\n",
|
||||
" 2 0.77 0.31 0.45 86\n",
|
||||
" 3 0.50 0.04 0.08 46\n",
|
||||
" 4 0.69 0.33 0.44 83\n",
|
||||
" 5 0.79 0.80 0.79 245\n",
|
||||
" 6 0.69 0.26 0.38 42\n",
|
||||
" 7 0.74 0.62 0.68 127\n",
|
||||
" 8 1.00 0.67 0.80 12\n",
|
||||
" 9 0.80 0.57 0.67 127\n",
|
||||
" 10 1.00 0.50 0.67 14\n",
|
||||
" 11 0.79 0.46 0.58 106\n",
|
||||
" Action 0.86 0.87 0.87 300\n",
|
||||
" Adventure 0.74 0.66 0.70 216\n",
|
||||
" Casual 0.79 0.22 0.35 86\n",
|
||||
" Early Access 0.50 0.02 0.04 46\n",
|
||||
" Free To Play 0.79 0.28 0.41 83\n",
|
||||
" Indie 0.77 0.81 0.79 245\n",
|
||||
"Massively Multiplayer 0.89 0.19 0.31 42\n",
|
||||
" RPG 0.80 0.55 0.65 127\n",
|
||||
" Racing 1.00 0.58 0.74 12\n",
|
||||
" Simulation 0.86 0.50 0.64 127\n",
|
||||
" Sports 1.00 0.29 0.44 14\n",
|
||||
" Strategy 0.80 0.41 0.54 106\n",
|
||||
"\n",
|
||||
" micro avg 0.79 0.62 0.69 1404\n",
|
||||
" macro avg 0.78 0.51 0.59 1404\n",
|
||||
"weighted avg 0.77 0.62 0.67 1404\n",
|
||||
" samples avg 0.80 0.68 0.70 1404\n",
|
||||
" micro avg 0.81 0.60 0.69 1404\n",
|
||||
" macro avg 0.82 0.45 0.54 1404\n",
|
||||
" weighted avg 0.80 0.60 0.65 1404\n",
|
||||
" samples avg 0.81 0.66 0.69 1404\n",
|
||||
"\n",
|
||||
"Most important words of class 'Action':\n",
|
||||
"['action', 'weapons', 'shooter', 'fighting', 'fight', 'weapon', 'players', 'aim', 'gun', 'intense']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Adventure':\n",
|
||||
"['adventure', 'explore', 'puzzles', 'smite', 'far', 'stories', 'remake', 'hunting', 'don', 'secrets']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Casual':\n",
|
||||
"['puzzle', 'color', 'ball', 'smite', 'poker', 'click', 'communication', 'idle', 'cats', 'fun']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Early Access':\n",
|
||||
"['early', 'pals', 'backrooms', 'automation', 'rotwood', 'access', 'design', 'vrchat', 'nephelym', 'idleon']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Free To Play':\n",
|
||||
"['free', 'royale', 'mmo', 'pvp', 'arena', 'mmorpg', 'idle', 'cats', 'millions', 'team']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Indie':\n",
|
||||
"['game', 'horror', 'building', 'different', 'vermintide', 'generated', 'roguelike', 'better', 'soundtrack', 'procedurally']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Massively Multiplayer':\n",
|
||||
"['royale', 'mmorpg', 'players', 'mmo', 'pvp', 'ball', 'smite', 'scp', 'temtem', 'join']\n",
|
||||
"\n",
|
||||
"Most important words of class 'RPG':\n",
|
||||
"['rpg', 'loot', 'dungeons', 'combat', 'dungeon', 'character', 'fantasy', 'quests', 'skills', '觅长生']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Racing':\n",
|
||||
"['cars', 'racing', 'car', 'race', 'speed', 'driving', 'brokkoli', 'ddnet', 'rally', 'jeff']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Simulation':\n",
|
||||
"['simulator', 'realistic', 'simulation', 'physics', 'sandbox', 'building', 'workshop', 'management', 'car', 'idle']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Sports':\n",
|
||||
"['racing', 'skate', 'sports', 'football', 'rally', 'virtual', 'ea', 'vrchat', 'hunting', 'realistic']\n",
|
||||
"\n",
|
||||
"Most important words of class 'Strategy':\n",
|
||||
"['strategy', 'turn', 'units', 'buildings', 'strategic', 'heroes', 'tactical', 'command', '觅长生', 'squad']\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@@ -623,7 +666,18 @@
|
||||
"source": [
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"\n",
|
||||
"print(classification_report(y_test, y_pred, zero_division=0.0))"
|
||||
"print(classification_report(y_test, y_pred, target_names=y_test.columns, zero_division=0.0))\n",
|
||||
"\n",
|
||||
"feature_names = vectorizer.get_feature_names_out()\n",
|
||||
"class_names = y_test.columns\n",
|
||||
"\n",
|
||||
"for i, class_name in enumerate(class_names):\n",
|
||||
" coef = multi_target_clf.estimators_[i].coef_.flatten()\n",
|
||||
" # print the top 10 coefficients used\n",
|
||||
" top10 = np.argsort(coef)[-10:]\n",
|
||||
" print(f\"Most important words of class '{class_name}':\")\n",
|
||||
" print([feature_names[j] for j in top10][::-1]) \n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -632,7 +686,15 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Optimization\n",
|
||||
"**TODO optimize the model based on the test results**"
|
||||
"- Since our dataset contains multiple languages, it would be beneficial to either train a separate model for each language or to standardize the data beforehand and remove the stop words specific to each language.\n",
|
||||
"\n",
|
||||
"- Hyperparameter tuning should also be performed. For example, in LinearSVC, the C parameter controls the regularization strength (smaller values mean stronger regularization) and could be further optimized.\n",
|
||||
"\n",
|
||||
"- Instead of a simple train-test split, k-fold cross validation without a fixed random_state should be used to guard against overfitting, achieve better data mixing and obtain more robust results.\n",
|
||||
"\n",
|
||||
"- Additionally, ensemble learning methods could further improve performance.\n",
|
||||
"\n",
|
||||
"The biggest limitation of our dataset is the presence of many languages (especially CJK languages) but too few entries for each, which is also constrained by our computing resources."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -641,13 +703,21 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Conclusion and outlook\n",
|
||||
"**TODO Write a conclusion and outlook what can be done and where the issues were.**"
|
||||
"To conclude we can say that our model performs reasonably well for the intended application. With a larger dataset, the results would likely improve further. Considering the points mentioned above, it is quite impressive that the model achieves these results using only a small dataset and limited computational resources.\n",
|
||||
"\n",
|
||||
"Our collaboration as a team worked very smoothly throughout the project. Communication and planning were effective, allowing us to coordinate our tasks efficiently and make steady progress.\n",
|
||||
"\n",
|
||||
"The main challenge we faced was the limited computational resources available to us. Especially when working with the 10k dataset, training the models for statistical evaluation took a considerable amount of time. To address this, each team member ran different models in parallel on their own machines, with some training processes running for several days.\n",
|
||||
"\n",
|
||||
"Due to these computational constraints, we decided not to process the full dataset with 80,000 entries. Even though we had access to PCs equipped with mid- to high-range components, the training times were still prohibitively long. As a result, we focused our efforts on the smaller datasets to ensure we could complete the project within a reasonable timeframe.\n",
|
||||
"\n",
|
||||
"In summary, this project provided us with valuable insights into the challenges and opportunities of machine learning in a real-world context. Despite the limitations we faced, we were able to develop a functioning model and gain practical experience in data preprocessing, model selection, and evaluation. We are proud of what we achieved as a team and look forward to applying the knowledge and skills gained here to future projects.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@@ -661,7 +731,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
BIN
Machine-Learning.pdf
Normal file
BIN
Machine-Learning.pdf
Normal file
Binary file not shown.
49
README.md
49
README.md
@@ -1,18 +1,51 @@
|
||||
# Machine Learning Project – Summer Semester 2025
|
||||
|
||||
This project was created as part of the "Machine Learning" course at HTW Saar in the Practical Computer Science study program.
|
||||
This project was developed as part of the "Machine Learning" course at HTW Saar in the summer semester 2025 in "Practical Computer Science". The goal is to predict the genres of a game based on its description using various machine learning techniques.
|
||||
|
||||
## Objective
|
||||
## Project Overview
|
||||
|
||||
We are developing a Jupyter Notebook that automatically predicts the genre of Steam games based on their descriptions.
|
||||
As a data basis, we use a publicly available Steam Games dataset that we found on Kaggle.
|
||||
We use a cleaned Steam dataset containing game descriptions and genre labels as well as many other feature values. The main challenge was to build a robust multi-label classification model that can handle multiple genres per game and work with a relatively small dataset due to computational constraints.
|
||||
|
||||
Our workflow includes:
|
||||
- Data cleaning and preprocessing
|
||||
- Feature extraction
|
||||
- Multi-label genre encoding
|
||||
- Model selection and evaluation
|
||||
- Optimization suggestions for future work
|
||||
|
||||
## Dataset
|
||||
|
||||
We use the [Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data).
|
||||
The dataset used for this project is available here:
|
||||
[Steam Games Dataset from Kaggle](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)
|
||||
|
||||
## Repository
|
||||
|
||||
The full project, including the Jupyter Notebook, code, results and all data set sizes used, can be found on GitHub:
|
||||
[GitHub FlorianSpeicher04/machine-learning](https://github.com/FlorianSpeicher04/machine-learning)
|
||||
|
||||
## Large File Storage (git-lfs)
|
||||
|
||||
Some files in this repository (such as the datasets) are managed using [git-lfs](https://git-lfs.github.com/).
|
||||
To clone the repository with all large files, please make sure you have git-lfs installed:
|
||||
|
||||
```sh
|
||||
git lfs install
|
||||
git clone https://github.com/FlorianSpeicher04/machine-learning
|
||||
```
|
||||
|
||||
## How to Run
|
||||
|
||||
1. Clone the repository (see above).
|
||||
2. Install the required Python packages.
|
||||
3. Open `notebook.ipynb` in Jupyter Notebook or VS Code.
|
||||
4. Follow the steps in the notebook to reproduce the results (Run All).
|
||||
|
||||
## Results
|
||||
|
||||
Our model achieves reasonable performance given the dataset size and computational limitations. For more details, see the evaluation and conclusion sections in the notebook.
|
||||
|
||||
## Contributors
|
||||
|
||||
- Maximilian Kany
|
||||
- Florian Speicher
|
||||
- Tim Wall
|
||||
- Maximilian Kany 5016118
|
||||
- Florian Speicher 5014185
|
||||
- Tim Wall 5014365
|
||||
|
||||
@@ -2,9 +2,9 @@ import os
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
datasets = {
|
||||
"cleaned": "games_march2025_cleaned",
|
||||
"cleaned_2k": "games_march2025_cleaned_2k",
|
||||
"cleaned_10k": "games_march2025_cleaned_10k"
|
||||
#"cleaned": "games_march2025_cleaned",
|
||||
"cleaned_2k": "games_march2025_cleaned_2k_i3k",
|
||||
"cleaned_10k": "games_march2025_cleaned_10k_i3k"
|
||||
}
|
||||
# def results
|
||||
results = {}
|
||||
@@ -28,14 +28,14 @@ x = range(len(models))
|
||||
|
||||
plt.figure(figsize=(12,6))
|
||||
#plt.bar([i - 0.25 for i in x], [results["cleaned"][m] for m in models], width=0.25, label="cleaned")
|
||||
plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.5)#, label="cleaned_2k")
|
||||
#plt.bar([i + 0.25 for i in x], [results["cleaned_10k"][m] for m in models], width=0.25, label="cleaned_10k")
|
||||
plt.bar(x, [results["cleaned_2k"][m] for m in models], width=0.25, label="2k Dataset")
|
||||
plt.bar([i + 0.25 for i in x], [results["cleaned_10k"].get(m,0) for m in models], width=0.25, label="10k Dataset")
|
||||
|
||||
plt.xticks(x, models, rotation=90)
|
||||
plt.ylim(0, 1) # min max
|
||||
plt.ylabel("Weighted F1-Score")
|
||||
plt.title("Model Performance across Datasets")
|
||||
#plt.legend()
|
||||
plt.legend()
|
||||
plt.tight_layout()
|
||||
plt.savefig('compare_graph_latest.png')
|
||||
plt.show()
|
||||
|
||||
133
test_script.py
133
test_script.py
@@ -1,133 +0,0 @@
|
||||
|
||||
|
||||
#### INITIALIZE
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn import set_config
|
||||
set_config(transform_output="pandas") # dataframe supremacy
|
||||
|
||||
# load data
|
||||
# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
|
||||
dataset = pd.read_csv("./games_march2025_cleaned_2k.csv",sep=",")
|
||||
print(dataset.head())
|
||||
|
||||
|
||||
|
||||
|
||||
#### DROP UNIQUES
|
||||
print("DROP")
|
||||
|
||||
#TODO: these columns get dropped by the transformer below anyway
|
||||
|
||||
# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
|
||||
#dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',
|
||||
# 'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],
|
||||
# axis=1, inplace=True)
|
||||
#print(dataset.head())
|
||||
|
||||
#### STRUCTURIZE AND STANDARDIZE
|
||||
print("STRUCTURE")
|
||||
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
|
||||
# desc, genres, tags
|
||||
column_transformer = ColumnTransformer([
|
||||
# merge all descriptions
|
||||
('desc', FunctionTransformer(lambda X: X.fillna('').agg(' '.join, axis=1).to_frame(name="desc")),
|
||||
['detailed_description', 'about_the_game', 'short_description']),
|
||||
# genre -> actual genre, but very coarse
|
||||
# tags -> user defined tags; title num list
|
||||
#TODO: decide whether we drop tags
|
||||
('pass', 'passthrough', ['genres']),#, 'tags'
|
||||
],
|
||||
verbose_feature_names_out=False
|
||||
)
|
||||
dataset = column_transformer.fit_transform(dataset)
|
||||
print(dataset)
|
||||
|
||||
|
||||
|
||||
#### SET MISSING VALUES
|
||||
print("SETMISS")
|
||||
|
||||
|
||||
# Setting missing numeric values to the mean
|
||||
dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
|
||||
# Setting missing text values to an empty string
|
||||
dataset.fillna('', inplace=True)
|
||||
# Dropping any rows that still contain missing values
|
||||
dataset.dropna(inplace=True)
|
||||
|
||||
|
||||
|
||||
|
||||
##### STRUCTURIZE GENRES to onehot
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
import ast
|
||||
#serialize array
|
||||
dataset['genres'] = dataset['genres'].map(lambda s: ast.literal_eval(s))
|
||||
print(dataset['genres']) # in py but not yet onehotenc
|
||||
|
||||
# MultiLabelBinarizer does onehotenc for arrays
|
||||
mlb_genres = MultiLabelBinarizer()
|
||||
genres_encoded = mlb_genres.fit_transform(dataset.pop('genres'))
|
||||
genres_count = len(mlb_genres.classes_) # for multi-label classifiction later
|
||||
|
||||
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
|
||||
print(genres_df)
|
||||
#dataset = pd.concat([dataset, genres_df], axis=1)
|
||||
#print(dataset)
|
||||
|
||||
|
||||
#### convert text to bag of words
|
||||
|
||||
## Count vs Tfidf vectorizer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
vectorizer = TfidfVectorizer()
|
||||
tfidf_matrix = vectorizer.fit_transform(dataset['desc']) # matrix
|
||||
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
||||
print(tfidf_df)
|
||||
|
||||
|
||||
##### MODEL
|
||||
print("MODEL")
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.multioutput import MultiOutputClassifier
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
|
||||
X = tfidf_df
|
||||
y = genres_df
|
||||
|
||||
|
||||
# cleanup datapoints that dont have a target value (all target columns are 0)
|
||||
mask = y.sum(axis=1).map(lambda x: x > 0)
|
||||
#print((mask == False).sum()) #31 cases with all target columns 0
|
||||
X_clean = X[mask]
|
||||
y_clean = y[mask]
|
||||
|
||||
# Split dataset
|
||||
from sklearn.model_selection import train_test_split
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, random_state=0)
|
||||
|
||||
|
||||
# we want to have multiple possible outputs (multi-label-classficiation) -> multioutputclassifier
|
||||
# logi regression is our base system
|
||||
# n_jobs=1 since there seems to be some multithreading join issue in sklearn (or my pc is too bad)
|
||||
multi_target_clf = MultiOutputClassifier(LogisticRegression(max_iter=1337, random_state=0), n_jobs=1)
|
||||
|
||||
# model training
|
||||
multi_target_clf.fit(X_train, y_train)
|
||||
|
||||
# predict against test data
|
||||
y_pred = multi_target_clf.predict(X_test)
|
||||
|
||||
# print prec, recall, f1 etc
|
||||
print(classification_report(y_test, y_pred, zero_division=0.0))
|
||||
|
||||
|
||||
#print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")
|
||||
Reference in New Issue
Block a user