From 1885f863711aef03bbc9c35a29e965f7dacdcaa3 Mon Sep 17 00:00:00 2001 From: FlorianSpeicher Date: Mon, 11 Aug 2025 23:45:16 +0200 Subject: [PATCH] Add test script and minor Notebook changes with Tim --- notebook.ipynb | 45 +++++++++++++++++---------- test_script.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 test_script.py diff --git a/notebook.ipynb b/notebook.ipynb index c44ab63..44be09a 100644 --- a/notebook.ipynb +++ b/notebook.ipynb @@ -10,12 +10,15 @@ "The goal of this project is getting the genre(s) of a game trough its given metadata\n", "\n", "## Dataset\n", - "For our project we use a Steam DataSet from kaggle. You can find it under the following URL: [Kaggle.com](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)" + "For our project we use a Steam dataSet from kaggle. You can find it under the following URL: [Kaggle.com](https://www.kaggle.com/datasets/artermiloff/steam-games-dataset/data)\n", + "\n", + "### Importing the dataSet\n", + "The dataSet is imported and added as a variable." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "3116b75f", "metadata": {}, "outputs": [ @@ -104,6 +107,7 @@ "- AppId\n", "- Name of the Game\n", "- Realease Date\n", + "- Reviews\n", "- Header Image\n", "- Website\n", "- Support URL\n", @@ -118,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "06dedcdf", "metadata": {}, "outputs": [ @@ -195,7 +199,9 @@ ], "source": [ "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n", - "dataset.drop(['appid', 'name', 'release_date', 'header_image', 'website', 'support_url', 'support_email', 'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'], axis=1, inplace=True)\n", + "dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',\n", + " 'metacritic_url', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],\n", + " axis=1, inplace=True)\n", "print(dataset.head())" ] }, @@ -215,19 +221,24 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "4e8b407c", "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'numpy.ndarray' object has no attribute 'head'", + "ename": "ValueError", + "evalue": "all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at 
index 3 has size 9999", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m 4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m 5\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 6\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 7\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 8\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 9\u001b[39m )\n\u001b[32m 11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n", - "\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'" + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# 
appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\u001b[39;00m\n\u001b[32m 4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m 5\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 6\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 7\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 8\u001b[39m (\u001b[33m'\u001b[39m\u001b[33mpassthrough\u001b[39m\u001b[33m'\u001b[39m, 
[\u001b[33m'\u001b[39m\u001b[33mrequired_age\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mprice\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mdlc_count\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mwindows\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmac\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mlinux\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmetacritic_score\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33machievements\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mrecommendations\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnotes\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33msupported_languages\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mfull_audio_languages\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mcategories\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mgenres\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33muser_score\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mscore_rank\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpositive\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnegative\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33maverage_playtime_forever\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33maverage_playtime_2weeks\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmedian_playtime_forever\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mmedian_playtime_2weeks\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mdiscount\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpeak_ccu\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mtags\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpct_pos_total\u001b[39m\u001b[3
3m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnum_reviews_total\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mpct_pos_recent\u001b[39m\u001b[33m'\u001b[39m,\u001b[33m'\u001b[39m\u001b[33mnum_reviews_recent\u001b[39m\u001b[33m'\u001b[39m])\n\u001b[32m 9\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m dataset = \u001b[43mcolumn_transformer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset.head())\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_set_output.py:319\u001b[39m, in \u001b[36m_wrap_method_output..wrapped\u001b[39m\u001b[34m(self, X, *args, **kwargs)\u001b[39m\n\u001b[32m 317\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[32m 318\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, *args, **kwargs):\n\u001b[32m--> \u001b[39m\u001b[32m319\u001b[39m data_to_wrap = \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 320\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[32m 321\u001b[39m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[32m 322\u001b[39m return_tuple = (\n\u001b[32m 323\u001b[39m _wrap_data_with_container(method, data_to_wrap[\u001b[32m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[32m 324\u001b[39m *data_to_wrap[\u001b[32m1\u001b[39m:],\n\u001b[32m 325\u001b[39m )\n", + 
"\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:1389\u001b[39m, in \u001b[36m_fit_context..decorator..wrapper\u001b[39m\u001b[34m(estimator, *args, **kwargs)\u001b[39m\n\u001b[32m 1382\u001b[39m estimator._validate_params()\n\u001b[32m 1384\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[32m 1385\u001b[39m skip_parameter_validation=(\n\u001b[32m 1386\u001b[39m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[32m 1387\u001b[39m )\n\u001b[32m 1388\u001b[39m ):\n\u001b[32m-> \u001b[39m\u001b[32m1389\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:1031\u001b[39m, in \u001b[36mColumnTransformer.fit_transform\u001b[39m\u001b[34m(self, X, y, **params)\u001b[39m\n\u001b[32m 1028\u001b[39m \u001b[38;5;28mself\u001b[39m._validate_output(Xs)\n\u001b[32m 1029\u001b[39m \u001b[38;5;28mself\u001b[39m._record_output_indices(Xs)\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_hstack\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mXs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m=\u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\sklearn\\compose\\_column_transformer.py:1225\u001b[39m, in 
\u001b[36mColumnTransformer._hstack\u001b[39m\u001b[34m(self, Xs, n_samples)\u001b[39m\n\u001b[32m 1215\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1216\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mConcatenating DataFrames from the transformer\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms output lead to\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1217\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m an inconsistent number of samples. The output may have Pandas\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m (...)\u001b[39m\u001b[32m 1220\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m samples.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1221\u001b[39m )\n\u001b[32m 1223\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m output\n\u001b[32m-> \u001b[39m\u001b[32m1225\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhstack\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\FlorianSpeicher\\anaconda3\\Lib\\site-packages\\numpy\\_core\\shape_base.py:364\u001b[39m, in \u001b[36mhstack\u001b[39m\u001b[34m(tup, dtype, casting)\u001b[39m\n\u001b[32m 362\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _nx.concatenate(arrs, \u001b[32m0\u001b[39m, dtype=dtype, casting=casting)\n\u001b[32m 363\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m364\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_nx\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconcatenate\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcasting\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcasting\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[31mValueError\u001b[39m: all the input array 
dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 3 has size 9999" ] } ], @@ -239,7 +250,7 @@ " (TfidfVectorizer(stop_words='english'), ['detailed_description']),\n", " (TfidfVectorizer(stop_words='english'), ['about_the_game']),\n", " (TfidfVectorizer(stop_words='english'), ['short_description']),\n", - " (TfidfVectorizer(stop_words='english'), ['reviews']),\n", + " ('passthrough', ['required_age','price','dlc_count','reviews','windows','mac','linux','metacritic_score','achievements','recommendations','notes','supported_languages','full_audio_languages','categories','genres','user_score','score_rank','positive','negative','average_playtime_forever','average_playtime_2weeks','median_playtime_forever','median_playtime_2weeks','discount','peak_ccu','tags','pct_pos_total','num_reviews_total','pct_pos_recent','num_reviews_recent'])\n", ")\n", "\n", "dataset = column_transformer.fit_transform(dataset)\n", @@ -262,8 +273,8 @@ "id": "6a2a3d4f", "metadata": {}, "source": [ - "### Setting missing values\n", - "**TODO: Removing or Setting values that are not set or NaN in the DataSet**" + "### Handling missing values\n", + "Removing NaN values in the dataSet and setting missing numerical feature values to the mean feature count. Missing Text values are set to a default String `Unknown`." ] }, { @@ -287,12 +298,12 @@ "metadata": {}, "source": [ "# Data Split\n", - "**TODO splitting the Data into Train, test and validation data**" + "Splitting our dataSet to training and testing data. The relation is 80% training and 20% testing data." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "cfbf3787", "metadata": {}, "outputs": [ @@ -307,7 +318,7 @@ "source": [ "from sklearn.model_selection import train_test_split\n", "\n", - "# Annahme: 'genres' ist das Ziel/Label\n", + "# Setting the target feature 'genres' and dropping it from the dataset\n", "X = dataset.drop('genres', axis=1)\n", "y = dataset['genres']\n", "\n", @@ -315,7 +326,7 @@ " X, y, test_size=0.2, random_state=42\n", ")\n", "\n", - "print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")" + "print(f\"Training: {X_train.shape}, Testing: {X_test.shape}\")" ] }, { diff --git a/test_script.py b/test_script.py new file mode 100644 index 0000000..0acb908 --- /dev/null +++ b/test_script.py @@ -0,0 +1,84 @@ +import numpy as np +import pandas as pd + + +#### INITIALIZE + +# load data +# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent +dataset = pd.read_csv("./games_march2025_cleaned_10k.csv",sep=",") +print(dataset.head()) + + + + +#### DROP UNIQUES + +# 
appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
+dataset.drop(['appid', 'name', 'release_date', 'reviews', 'header_image', 'website', 'support_url', 'support_email',
+              'metacritic_url', 'notes', 'developers', 'publishers', 'screenshots', 'movies', 'estimated_owners'],
+             axis=1, inplace=True)
+print(dataset.head())
+
+
+
+
+#### SET MISSING VALUES
+
+# Replace missing numeric values with the column mean
+dataset.fillna(dataset.mean(numeric_only=True), inplace=True)
+# Replace every value that is still missing with the placeholder 'Unknown'
+dataset.fillna('Unknown', inplace=True)
+# Drop rows that still contain NaN; after the two fills above this is expected to be a no-op
+dataset.dropna(inplace=True)
+
+
+
+
+#### STRUCTURIZE AND STANDARDIZE
+
+from sklearn.compose import make_column_transformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
+
+# Text columns are selected by scalar name (not a one-element list) so that TfidfVectorizer
+# receives a 1-D column of documents. With a list selector it is handed a 2-D frame, iterates
+# over the column labels instead of the rows, and returns a single row that cannot be stacked.
+column_transformer = make_column_transformer(
+    (TfidfVectorizer(stop_words='english'), 'detailed_description'),
+    (TfidfVectorizer(stop_words='english'), 'about_the_game'),
+    (TfidfVectorizer(stop_words='english'), 'short_description'),
+    # handle_unknown='ignore': the encoder is fitted on the training split only (see below)
+    (OneHotEncoder(handle_unknown='ignore'), ['windows', 'mac', 'linux']),
+    (StandardScaler(), ['price']),
+    (FunctionTransformer(lambda x: x/100.0), ['metacritic_score']),
+    (StandardScaler(), ['achievements']),
+    (StandardScaler(), ['recommendations']),
+    # TODO: custom one-hot encoder for 'supported_languages', 'full_audio_languages', 'categories'
+    # and 'tags'. Until it exists these columns are left out (the default remainder='drop'
+    # discards them): raw strings cannot be stacked with the sparse TF-IDF output.
+    # NOTE(review): the passthrough columns below are assumed to be numeric; a column that was
+    # entirely NaN would have been filled with 'Unknown' above - verify against the CSV.
+    ('passthrough', ['required_age', 'dlc_count', 'user_score', 'score_rank', 'positive', 'negative', 'average_playtime_forever', 'average_playtime_2weeks', 'median_playtime_forever', 'median_playtime_2weeks', 'discount', 'peak_ccu', 'pct_pos_total', 'num_reviews_total', 'pct_pos_recent', 'num_reviews_recent']),
+)
+
+
+
+
+#### SPLIT AND TRANSFORM
+
+# 'genres' is the target/label. It is separated while `dataset` is still a DataFrame and
+# before the transformer runs, so it does not end up in the feature matrix.
+X = dataset.drop('genres', axis=1)
+y = dataset['genres']
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# Fit on the training split only, then apply the fitted transformer to the test split
+X_train = column_transformer.fit_transform(X_train)
+X_test = column_transformer.transform(X_test)
+
+print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")
\ No newline at end of file