Add missing-value handling and train/test data split

This commit is contained in:
FlorianSpeicher
2025-08-11 23:01:20 +02:00
parent 3c055a1ce0
commit 4842b55e4f

View File

@@ -15,7 +15,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 4,
"id": "3116b75f", "id": "3116b75f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -89,7 +89,7 @@
"\n", "\n",
"# load data\n", "# load data\n",
"# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n", "# appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,header_image,website,support_url,support_email,windows,mac,linux,metacritic_score,metacritic_url,achievements,recommendations,notes,supported_languages,full_audio_languages,packages,developers,publishers,categories,genres,screenshots,movies,user_score,score_rank,positive,negative,estimated_owners,average_playtime_forever,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent\n",
"dataset = pd.read_csv(\"./games_march2025_cleaned.csv\",sep=\",\")\n", "dataset = pd.read_csv(\"./games_march2025_cleaned_10k.csv\",sep=\",\")\n",
"print(dataset.head())" "print(dataset.head())"
] ]
}, },
@@ -118,7 +118,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 5,
"id": "06dedcdf", "id": "06dedcdf",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -215,15 +215,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"id": "4e8b407c", "id": "4e8b407c",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "AttributeError",
"output_type": "stream", "evalue": "'numpy.ndarray' object has no attribute 'head'",
"text": [ "output_type": "error",
"[[1. 1. 1. 1.]]\n" "traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m 4\u001b[39m column_transformer = make_column_transformer(\n\u001b[32m 5\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mdetailed_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 6\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mabout_the_game\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 7\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mshort_description\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 8\u001b[39m (TfidfVectorizer(stop_words=\u001b[33m'\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m'\u001b[39m), [\u001b[33m'\u001b[39m\u001b[33mreviews\u001b[39m\u001b[33m'\u001b[39m]),\n\u001b[32m 9\u001b[39m )\n\u001b[32m 11\u001b[39m dataset = column_transformer.fit_transform(dataset)\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhead\u001b[49m())\n",
"\u001b[31mAttributeError\u001b[39m: 'numpy.ndarray' object has no attribute 'head'"
] ]
} }
], ],
@@ -238,8 +242,8 @@
" (TfidfVectorizer(stop_words='english'), ['reviews']),\n", " (TfidfVectorizer(stop_words='english'), ['reviews']),\n",
")\n", ")\n",
"\n", "\n",
"dataset2 = column_transformer.fit_transform(dataset)\n", "dataset = column_transformer.fit_transform(dataset)\n",
"print(dataset2)" "print(dataset.head())"
] ]
}, },
{ {
@@ -262,6 +266,21 @@
"**TODO: Removing or Setting values that are not set or NaN in the DataSet**" "**TODO: Removing or Setting values that are not set or NaN in the DataSet**"
] ]
}, },
{
"cell_type": "code",
"execution_count": 6,
"id": "dea7dc00",
"metadata": {},
"outputs": [],
"source": [
"# Impute missing values and rebind `dataset` to the result instead of mutating it in place.\n",
"dataset = (\n",
"    dataset\n",
"    # Numeric columns: replace NaN with the column mean.\n",
"    .fillna(dataset.mean(numeric_only=True))\n",
"    # Every remaining NaN (e.g. in text columns) becomes the placeholder 'Unknown'.\n",
"    .fillna('Unknown')\n",
"    # Drop any row that still contains a missing value; the scalar fill above\n",
"    # should already have removed them all, so this acts as a safeguard.\n",
"    .dropna()\n",
")"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "091d7e13", "id": "091d7e13",
@@ -271,6 +290,34 @@
"**TODO splitting the Data into Train, test and validation data**" "**TODO splitting the Data into Train, test and validation data**"
] ]
}, },
{
"cell_type": "code",
"execution_count": 7,
"id": "cfbf3787",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trainingsdaten: (7999, 33), Testdaten: (2000, 33)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Assumption: 'genres' is the target/label column; all other columns are features.\n",
"y = dataset['genres']\n",
"X = dataset.drop(columns='genres')\n",
"\n",
"# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"print(f\"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}\")"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "12b5283d", "id": "12b5283d",